diff --git a/examples/pydantic_ai_examples/evals/example_03_unit_testing.py b/examples/pydantic_ai_examples/evals/example_03_unit_testing.py index e995933944..646f24f597 100644 --- a/examples/pydantic_ai_examples/evals/example_03_unit_testing.py +++ b/examples/pydantic_ai_examples/evals/example_03_unit_testing.py @@ -29,7 +29,9 @@ def evaluate_dataset(): report = dataset.evaluate_sync(infer_time_range) print(report) - assertion_pass_rate = report.averages().assertions + averages = report.averages() + assert averages is not None + assertion_pass_rate = averages.assertions assert assertion_pass_rate is not None, 'There should be at least one assertion' assert assertion_pass_rate > 0.9, ( f'The assertion pass rate was {assertion_pass_rate:.1%}; it should be above 90%.' diff --git a/pydantic_evals/pydantic_evals/dataset.py b/pydantic_evals/pydantic_evals/dataset.py index 5aa040cc27..acfd4ec64f 100644 --- a/pydantic_evals/pydantic_evals/dataset.py +++ b/pydantic_evals/pydantic_evals/dataset.py @@ -13,6 +13,7 @@ import inspect import sys import time +import traceback import warnings from collections.abc import Awaitable, Mapping, Sequence from contextlib import AsyncExitStack, nullcontext @@ -20,7 +21,7 @@ from dataclasses import dataclass, field from inspect import iscoroutinefunction from pathlib import Path -from typing import Any, Callable, Generic, Literal, Union, cast +from typing import TYPE_CHECKING, Any, Callable, Generic, Literal, Union, cast import anyio import logfire_api @@ -40,10 +41,14 @@ from .evaluators._run_evaluator import run_evaluator from .evaluators.common import DEFAULT_EVALUATORS from .evaluators.context import EvaluatorContext +from .evaluators.evaluator import EvaluatorFailure from .evaluators.spec import EvaluatorSpec from .otel import SpanTree from .otel._context_subtree import context_subtree -from .reporting import EvaluationReport, ReportCase, ReportCaseAggregate +from .reporting import EvaluationReport, ReportCase, ReportCaseAggregate, ReportCaseFailure + +if TYPE_CHECKING: + from tenacity import AsyncRetrying if sys.version_info < (3, 11): from exceptiongroup import ExceptionGroup # pragma: lax no cover @@ -84,6 +89,7 @@ _REPORT_CASES_ADAPTER = TypeAdapter(list[ReportCase]) +_REPORT_CASE_FAILURES_ADAPTER = TypeAdapter(list[ReportCaseFailure]) _REPORT_CASE_AGGREGATE_ADAPTER = TypeAdapter(ReportCaseAggregate) @@ -171,11 +177,6 @@ def __init__( self.evaluators = list(evaluators) -# TODO: Consider making one or more of the following changes to this type: -# * Add `task: Callable[[InputsT], Awaitable[OutputT]` as a field -# * Add `inputs_type`, `output_type`, etc. as kwargs on `__init__` -# * Rename to `Evaluation` -# TODO: Allow `task` to be sync _or_ async class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', arbitrary_types_allowed=True): """A dataset of test [cases][pydantic_evals.Case]. @@ -263,6 +264,7 @@ async def evaluate( name: str | None = None, max_concurrency: int | None = None, progress: bool = True, + retry: AsyncRetrying | None = None, ) -> EvaluationReport[InputsT, OutputT, MetadataT]: """Evaluates the test cases in the dataset using the given task. @@ -277,6 +279,7 @@ async def evaluate( max_concurrency: The maximum number of concurrent evaluations of the task to allow. If None, all cases will be evaluated concurrently. progress: Whether to show a progress bar for the evaluation. Defaults to `True`. + retry: Optional retry configuration for the task execution. Returns: A report containing the results of the evaluation. 
@@ -292,24 +295,30 @@ async def evaluate( async def _handle_case(case: Case[InputsT, OutputT, MetadataT], report_case_name: str): async with limiter: - result = await _run_task_and_evaluators(task, case, report_case_name, self.evaluators) + result = await _run_task_and_evaluators(task, case, report_case_name, self.evaluators, retry) if progress_bar and task_id is not None: # pragma: no branch progress_bar.update(task_id, advance=1) return result + cases_and_failures = await task_group_gather( + [ + lambda case=case, i=i: _handle_case(case, case.name or f'Case {i}') + for i, case in enumerate(self.cases, 1) + ] + ) report = EvaluationReport( name=name, - cases=await task_group_gather( - [ - lambda case=case, i=i: _handle_case(case, case.name or f'Case {i}') - for i, case in enumerate(self.cases, 1) - ] - ), + cases=[x for x in cases_and_failures if isinstance(x, ReportCase)], + failures=[x for x in cases_and_failures if isinstance(x, ReportCaseFailure)], ) # TODO(DavidM): This attribute will be too big in general; remove it once we can use child spans in details panel: eval_span.set_attribute('cases', _REPORT_CASES_ADAPTER.dump_python(report.cases)) + # TODO(DavidM): This attribute will be too big in general; remove it once we can use child spans in details panel: + eval_span.set_attribute('failures', _REPORT_CASE_FAILURES_ADAPTER.dump_python(report.failures)) # TODO(DavidM): Remove this 'averages' attribute once we compute it in the details panel - eval_span.set_attribute('averages', _REPORT_CASE_AGGREGATE_ADAPTER.dump_python(report.averages())) + averages = report.averages() + if averages: + eval_span.set_attribute('averages', _REPORT_CASE_AGGREGATE_ADAPTER.dump_python(averages)) return report def evaluate_sync( @@ -817,13 +826,16 @@ def record_attribute(self, name: str, value: Any) -> None: async def _run_task( - task: Callable[[InputsT], Awaitable[OutputT] | OutputT], case: Case[InputsT, OutputT, MetadataT] + task: Callable[[InputsT], Awaitable[OutputT] | OutputT], + case: Case[InputsT, OutputT, MetadataT], + retry: AsyncRetrying | None = None, ) -> EvaluatorContext[InputsT, OutputT, MetadataT]: """Run a task on a case and return the context for evaluators. Args: task: The task to run. case: The case to run the task on. + retry: The retry strategy to use. Returns: An EvaluatorContext containing the inputs, actual output, expected output, and metadata. @@ -831,24 +843,38 @@ async def _run_task( Raises: Exception: Any exception raised by the task. """ - task_run = _TaskRun() - if _CURRENT_TASK_RUN.get() is not None: # pragma: no cover - raise RuntimeError('A task run has already been entered. Task runs should not be nested') - # Note: the current behavior is for task execution errors to just bubble up all the way and kill the evaluation. - # Should we handle them for the user in some way? If so, I guess we'd want to do that here. - token = _CURRENT_TASK_RUN.set(task_run) - try: - with _logfire.span('execute {task}', task=get_unwrapped_function_name(task)) as task_span: - with context_subtree() as span_tree: + async def _run_once(): + task_run_ = _TaskRun() + if _CURRENT_TASK_RUN.get() is not None: # pragma: no cover + raise RuntimeError('A task run has already been entered. 
Task runs should not be nested') + + token = _CURRENT_TASK_RUN.set(task_run_) + try: + with ( + _logfire.span('execute {task}', task=get_unwrapped_function_name(task)) as task_span, + context_subtree() as span_tree_, + ): t0 = time.perf_counter() if iscoroutinefunction(task): - task_output = cast(OutputT, await task(case.inputs)) + task_output_ = cast(OutputT, await task(case.inputs)) else: - task_output = cast(OutputT, await to_thread.run_sync(task, case.inputs)) + task_output_ = cast(OutputT, await to_thread.run_sync(task, case.inputs)) fallback_duration = time.perf_counter() - t0 - finally: - _CURRENT_TASK_RUN.reset(token) + duration_ = _get_span_duration(task_span, fallback_duration) + return task_run_, task_output_, duration_, span_tree_ + finally: + _CURRENT_TASK_RUN.reset(token) + + async def _run_with_retries(): + if retry: + async for attempt in retry: + with attempt: + return await _run_once() + # Note: the following line will be unreachable if retry is not None + return await _run_once() + + task_run, task_output, duration, span_tree = await _run_with_retries() if isinstance(span_tree, SpanTree): # pragma: no branch # TODO: Question: Should we make this metric-attributes functionality more user-configurable in some way before merging? @@ -863,6 +889,7 @@ async def _run_task( if not isinstance(v, (int, float)): continue # TODO: Revisit this choice to strip the prefix.. + # TODO: Use the span-tracking-of-metrics functionality to simplify this implementation if k.startswith('gen_ai.usage.details.'): task_run.increment_metric(k.removeprefix('gen_ai.usage.details.'), v) elif k.startswith('gen_ai.usage.'): @@ -874,7 +901,7 @@ async def _run_task( metadata=case.metadata, expected_output=case.expected_output, output=task_output, - duration=_get_span_duration(task_span, fallback_duration), + duration=duration, _span_tree=span_tree, attributes=task_run.attributes, metrics=task_run.metrics, @@ -886,7 +913,8 @@ async def _run_task_and_evaluators( case: Case[InputsT, OutputT, MetadataT], report_case_name: str, dataset_evaluators: list[Evaluator[InputsT, OutputT, MetadataT]], -) -> ReportCase[InputsT, OutputT, MetadataT]: + retry: AsyncRetrying | None, +) -> ReportCase[InputsT, OutputT, MetadataT] | ReportCaseFailure[InputsT, OutputT, MetadataT]: """Run a task on a case and evaluate the results. Args: @@ -894,64 +922,83 @@ async def _run_task_and_evaluators( case: The case to run the task on. report_case_name: The name to use for this case in the report. dataset_evaluators: Evaluators from the dataset to apply to this case. + retry: The retry strategy to use for running the task. Returns: A ReportCase containing the evaluation results. 
""" - with _logfire.span( - 'case: {case_name}', - task_name=get_unwrapped_function_name(task), - case_name=report_case_name, - inputs=case.inputs, - metadata=case.metadata, - expected_output=case.expected_output, - ) as case_span: - t0 = time.time() - scoring_context = await _run_task(task, case) - - case_span.set_attribute('output', scoring_context.output) - case_span.set_attribute('task_duration', scoring_context.duration) - case_span.set_attribute('metrics', scoring_context.metrics) - case_span.set_attribute('attributes', scoring_context.attributes) - - evaluators = case.evaluators + dataset_evaluators - evaluator_outputs: list[EvaluationResult] = [] - if evaluators: - evaluator_outputs_by_task = await task_group_gather( - [lambda ev=ev: run_evaluator(ev, scoring_context) for ev in evaluators] - ) - evaluator_outputs += [out for outputs in evaluator_outputs_by_task for out in outputs] - - assertions, scores, labels = _group_evaluator_outputs_by_type(evaluator_outputs) - case_span.set_attribute('assertions', _evaluation_results_adapter.dump_python(assertions)) - case_span.set_attribute('scores', _evaluation_results_adapter.dump_python(scores)) - case_span.set_attribute('labels', _evaluation_results_adapter.dump_python(labels)) - - context = case_span.context - if context is None: # pragma: no cover - trace_id = '' - span_id = '' - else: - trace_id = f'{context.trace_id:032x}' - span_id = f'{context.span_id:016x}' - fallback_duration = time.time() - t0 - - return ReportCase[InputsT, OutputT, MetadataT]( - name=report_case_name, - inputs=case.inputs, - metadata=case.metadata, - expected_output=case.expected_output, - output=scoring_context.output, - metrics=scoring_context.metrics, - attributes=scoring_context.attributes, - scores=scores, - labels=labels, - assertions=assertions, - task_duration=scoring_context.duration, - total_duration=_get_span_duration(case_span, fallback_duration), - trace_id=trace_id, - span_id=span_id, - ) + trace_id = '' + span_id = '' + try: + with _logfire.span( + 'case: {case_name}', + task_name=get_unwrapped_function_name(task), + case_name=report_case_name, + inputs=case.inputs, + metadata=case.metadata, + expected_output=case.expected_output, + ) as case_span: + context = case_span.context + if context is not None: # pragma: no cover + trace_id = f'{context.trace_id:032x}' + span_id = f'{context.span_id:016x}' + + t0 = time.time() + scoring_context = await _run_task(task, case, retry) + + case_span.set_attribute('output', scoring_context.output) + case_span.set_attribute('task_duration', scoring_context.duration) + case_span.set_attribute('metrics', scoring_context.metrics) + case_span.set_attribute('attributes', scoring_context.attributes) + + evaluators = case.evaluators + dataset_evaluators + evaluator_outputs: list[EvaluationResult] = [] + evaluator_failures: list[EvaluatorFailure] = [] + if evaluators: + evaluator_outputs_by_task = await task_group_gather( + [lambda ev=ev: run_evaluator(ev, scoring_context) for ev in evaluators] + ) + for outputs in evaluator_outputs_by_task: + if isinstance(outputs, EvaluatorFailure): + evaluator_failures.append(outputs) + else: + evaluator_outputs.extend(outputs) + + assertions, scores, labels = _group_evaluator_outputs_by_type(evaluator_outputs) + case_span.set_attribute('assertions', _evaluation_results_adapter.dump_python(assertions)) + case_span.set_attribute('scores', _evaluation_results_adapter.dump_python(scores)) + case_span.set_attribute('labels', _evaluation_results_adapter.dump_python(labels)) + + 
fallback_duration = time.time() - t0 + + return ReportCase[InputsT, OutputT, MetadataT]( + name=report_case_name, + inputs=case.inputs, + metadata=case.metadata, + expected_output=case.expected_output, + output=scoring_context.output, + metrics=scoring_context.metrics, + attributes=scoring_context.attributes, + scores=scores, + labels=labels, + assertions=assertions, + task_duration=scoring_context.duration, + total_duration=_get_span_duration(case_span, fallback_duration), + trace_id=trace_id, + span_id=span_id, + evaluator_failures=evaluator_failures, + ) + except Exception as exc: + return ReportCaseFailure[InputsT, OutputT, MetadataT]( + name=report_case_name, + inputs=case.inputs, + metadata=case.metadata, + expected_output=case.expected_output, + error_message=f'{type(exc).__name__}: {exc}', + error_stacktrace=traceback.format_exc(), + trace_id=trace_id, + span_id=span_id, + ) _evaluation_results_adapter = TypeAdapter(Mapping[str, EvaluationResult]) diff --git a/pydantic_evals/pydantic_evals/evaluators/__init__.py b/pydantic_evals/pydantic_evals/evaluators/__init__.py index 36aa95cae0..beefe51002 100644 --- a/pydantic_evals/pydantic_evals/evaluators/__init__.py +++ b/pydantic_evals/pydantic_evals/evaluators/__init__.py @@ -10,7 +10,7 @@ Python, ) from .context import EvaluatorContext -from .evaluator import EvaluationReason, EvaluationResult, Evaluator, EvaluatorOutput, EvaluatorSpec +from .evaluator import EvaluationReason, EvaluationResult, Evaluator, EvaluatorFailure, EvaluatorOutput, EvaluatorSpec __all__ = ( # common @@ -27,6 +27,8 @@ 'EvaluatorContext', # evaluator 'Evaluator', + 'EvaluationReason', + 'EvaluatorFailure', 'EvaluatorOutput', 'EvaluatorSpec', 'EvaluationReason', diff --git a/pydantic_evals/pydantic_evals/evaluators/_run_evaluator.py b/pydantic_evals/pydantic_evals/evaluators/_run_evaluator.py index b5b58d8cd1..1c9323dd89 100644 --- a/pydantic_evals/pydantic_evals/evaluators/_run_evaluator.py +++ b/pydantic_evals/pydantic_evals/evaluators/_run_evaluator.py @@ -1,8 +1,10 @@ from __future__ import annotations +import traceback from collections.abc import Mapping from typing import Any +import logfire_api from pydantic import ( TypeAdapter, ValidationError, @@ -10,16 +12,36 @@ from typing_extensions import TypeVar from .context import EvaluatorContext -from .evaluator import EvaluationReason, EvaluationResult, EvaluationScalar, Evaluator, EvaluatorOutput +from .evaluator import ( + EvaluationReason, + EvaluationResult, + EvaluationScalar, + Evaluator, + EvaluatorFailure, + EvaluatorOutput, +) + +# while waiting for https://github.com/pydantic/logfire/issues/745 +try: + import logfire._internal.stack_info +except ImportError: + pass +else: + from pathlib import Path + + logfire._internal.stack_info.NON_USER_CODE_PREFIXES += (str(Path(__file__).parent.absolute()),) # pyright: ignore[reportPrivateImportUsage] + InputsT = TypeVar('InputsT', default=Any, contravariant=True) OutputT = TypeVar('OutputT', default=Any, contravariant=True) MetadataT = TypeVar('MetadataT', default=Any, contravariant=True) +_logfire = logfire_api.Logfire(otel_scope='pydantic-evals') + async def run_evaluator( evaluator: Evaluator[InputsT, OutputT, MetadataT], ctx: EvaluatorContext[InputsT, OutputT, MetadataT] -) -> list[EvaluationResult]: +) -> list[EvaluationResult] | EvaluatorFailure: """Run an evaluator and return the results. 
 This function runs an evaluator on the given context and processes the results into
@@ -30,17 +52,29 @@
         ctx: The context containing the inputs, outputs, and metadata for evaluation.
 
     Returns:
-        A list of evaluation results.
+        A list of evaluation results, or an evaluator failure if an exception is raised during its execution.
 
     Raises:
         ValueError: If the evaluator returns a value of an invalid type.
     """
-    raw_results = await evaluator.evaluate_async(ctx)
-
-    try:
-        results = _EVALUATOR_OUTPUT_ADAPTER.validate_python(raw_results)
-    except ValidationError as e:
-        raise ValueError(f'{evaluator!r}.evaluate returned a value of an invalid type: {raw_results!r}.') from e
+    try:
+        with _logfire.span(
+            'evaluator: {evaluator_name}',
+            evaluator_name=evaluator.get_default_evaluation_name(),
+        ):
+            raw_results = await evaluator.evaluate_async(ctx)
+
+            try:
+                results = _EVALUATOR_OUTPUT_ADAPTER.validate_python(raw_results)
+            except ValidationError as e:
+                raise ValueError(f'{evaluator!r}.evaluate returned a value of an invalid type: {raw_results!r}.') from e
+    except Exception as e:
+        return EvaluatorFailure(
+            name=evaluator.get_default_evaluation_name(),
+            error_message=f'{type(e).__name__}: {e}',
+            error_stacktrace=traceback.format_exc(),
+            source=evaluator.as_spec(),
+        )
 
     results = _convert_to_mapping(results, scalar_name=evaluator.get_default_evaluation_name())
diff --git a/pydantic_evals/pydantic_evals/evaluators/evaluator.py b/pydantic_evals/pydantic_evals/evaluators/evaluator.py
index 8ca514c916..fc11a951cd 100644
--- a/pydantic_evals/pydantic_evals/evaluators/evaluator.py
+++ b/pydantic_evals/pydantic_evals/evaluators/evaluator.py
@@ -25,6 +25,7 @@
     'EvaluationResult',
     'EvaluationScalar',
     'Evaluator',
+    'EvaluatorFailure',
     'EvaluatorOutput',
     'EvaluatorSpec',
 )
@@ -100,6 +101,16 @@ def downcast(self, *value_types: type[T]) -> EvaluationResult[T] | None:
         return None
 
 
+@dataclass
+class EvaluatorFailure:
+    """Represents a failure raised during the execution of an evaluator."""
+
+    name: str
+    error_message: str
+    error_stacktrace: str
+    source: EvaluatorSpec
+
+
 # Evaluators are contravariant in all of its parameters.
InputsT = TypeVar('InputsT', default=Any, contravariant=True) """Type variable for the inputs type of the task being evaluated.""" diff --git a/pydantic_evals/pydantic_evals/reporting/__init__.py b/pydantic_evals/pydantic_evals/reporting/__init__.py index d013af2fab..a5f64da3f1 100644 --- a/pydantic_evals/pydantic_evals/reporting/__init__.py +++ b/pydantic_evals/pydantic_evals/reporting/__init__.py @@ -2,7 +2,7 @@ from collections import defaultdict from collections.abc import Mapping -from dataclasses import dataclass +from dataclasses import dataclass, field from io import StringIO from typing import Any, Callable, Generic, Literal, Protocol, cast @@ -33,6 +33,8 @@ 'ReportCaseAggregate', ) +from ..evaluators.evaluator import EvaluatorFailure + MISSING_VALUE_STR = '[i][/i]' EMPTY_CELL_STR = '-' EMPTY_AGGREGATE_CELL_STR = '' @@ -71,8 +73,30 @@ class ReportCase(Generic[InputsT, OutputT, MetadataT]): trace_id: str span_id: str + evaluator_failures: list[EvaluatorFailure] = field(default_factory=list) + + +@dataclass +class ReportCaseFailure(Generic[InputsT, OutputT, MetadataT]): + name: str + """The name of the [case][pydantic_evals.Case].""" + inputs: InputsT + """The inputs to the task, from [`Case.inputs`][pydantic_evals.Case.inputs].""" + metadata: MetadataT | None + """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.Case.metadata].""" + expected_output: OutputT | None + """The expected output of the task, from [`Case.expected_output`][pydantic_evals.Case.expected_output].""" + + error_message: str + error_stacktrace: str + + # TODO(DavidM): Drop these once we can reference child spans in details panel: + trace_id: str + span_id: str + ReportCaseAdapter = TypeAdapter(ReportCase[Any, Any, Any]) +ReportCaseFailureAdapter = TypeAdapter(ReportCaseFailure[Any, Any, Any]) class ReportCaseAggregate(BaseModel): @@ -160,9 +184,14 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]): """The name of the report.""" cases: list[ReportCase[InputsT, OutputT, MetadataT]] """The cases in the report.""" + failures: list[ReportCaseFailure[InputsT, OutputT, MetadataT]] = field(default_factory=list) + """The failures in the report. 
These are cases where task execution raised an exception.""" + # TODO: Need to add failures to the print-out - def averages(self) -> ReportCaseAggregate: - return ReportCaseAggregate.average(self.cases) + def averages(self) -> ReportCaseAggregate | None: + if self.cases: + return ReportCaseAggregate.average(self.cases) + return None def print( self, @@ -177,6 +206,8 @@ def print( include_total_duration: bool = False, include_removed_cases: bool = False, include_averages: bool = True, + include_errors: bool = True, + include_error_stacktrace: bool = False, input_config: RenderValueConfig | None = None, metadata_config: RenderValueConfig | None = None, output_config: RenderValueConfig | None = None, @@ -209,7 +240,19 @@ def print( duration_config=duration_config, include_reasons=include_reasons, ) - Console(width=width).print(table) + console = Console(width=width) + console.print(table) + if include_errors and self.failures: + failures_table = self.failures_table( + include_input=include_input, + include_metadata=include_metadata, + include_expected_output=include_expected_output, + include_error_message=True, + include_error_stacktrace=include_error_stacktrace, + input_config=input_config, + metadata_config=metadata_config, + ) + console.print(failures_table, style='red') def console_table( self, @@ -245,6 +288,8 @@ def console_table( include_total_duration=include_total_duration, include_removed_cases=include_removed_cases, include_averages=include_averages, + include_error_message=False, + include_error_stacktrace=False, input_config={**_DEFAULT_VALUE_CONFIG, **(input_config or {})}, metadata_config={**_DEFAULT_VALUE_CONFIG, **(metadata_config or {})}, output_config=output_config or _DEFAULT_VALUE_CONFIG, @@ -259,6 +304,40 @@ def console_table( else: # pragma: no cover return renderer.build_diff_table(self, baseline) + def failures_table( + self, + *, + include_input: bool = False, + include_metadata: bool = False, + include_expected_output: bool = False, + include_error_message: bool = True, + include_error_stacktrace: bool = True, + input_config: RenderValueConfig | None = None, + metadata_config: RenderValueConfig | None = None, + ) -> Table: + """Return a table containing the failures in this report.""" + renderer = EvaluationRenderer( + include_input=include_input, + include_metadata=include_metadata, + include_expected_output=include_expected_output, + include_output=False, + include_durations=False, + include_total_duration=False, + include_removed_cases=False, + include_averages=False, + input_config={**_DEFAULT_VALUE_CONFIG, **(input_config or {})}, + metadata_config={**_DEFAULT_VALUE_CONFIG, **(metadata_config or {})}, + output_config=_DEFAULT_VALUE_CONFIG, + score_configs={}, + label_configs={}, + metric_configs={}, + duration_config=_DEFAULT_DURATION_CONFIG, + include_reasons=False, + include_error_message=include_error_message, + include_error_stacktrace=include_error_stacktrace, + ) + return renderer.build_failures_table(self) + def __str__(self) -> str: # pragma: lax no cover """Return a string representation of the report.""" table = self.console_table() @@ -538,6 +617,8 @@ class ReportCaseRenderer: include_reasons: bool include_durations: bool include_total_duration: bool + include_error_message: bool + include_error_stacktrace: bool input_renderer: _ValueRenderer metadata_renderer: _ValueRenderer @@ -571,6 +652,22 @@ def build_base_table(self, title: str) -> Table: table.add_column('Durations' if self.include_total_duration else 'Duration', justify='right') return table 
+ def build_failures_table(self, title: str) -> Table: + """Build and return a Rich Table for the failures output.""" + table = Table(title=title, show_lines=True) + table.add_column('Case ID', style='bold') + if self.include_input: + table.add_column('Inputs', overflow='fold') + if self.include_metadata: + table.add_column('Metadata', overflow='fold') + if self.include_expected_output: + table.add_column('Expected Output', overflow='fold') + if self.include_error_message: + table.add_column('Error Message', overflow='fold') + if self.include_error_stacktrace: + table.add_column('Error Stacktrace', overflow='fold') + return table + def build_row(self, case: ReportCase) -> list[str]: """Build a table row for a single case.""" row = [case.name] @@ -742,6 +839,27 @@ def build_diff_aggregate_row( return row + def build_failure_row(self, case: ReportCaseFailure) -> list[str]: + """Build a table row for a single case failure.""" + row = [case.name] + + if self.include_input: + row.append(self.input_renderer.render_value(None, case.inputs) or EMPTY_CELL_STR) + + if self.include_metadata: + row.append(self.metadata_renderer.render_value(None, case.metadata) or EMPTY_CELL_STR) + + if self.include_expected_output: + row.append(self.output_renderer.render_value(None, case.expected_output) or EMPTY_CELL_STR) + + if self.include_error_message: + row.append(case.error_message or EMPTY_CELL_STR) + + if self.include_error_stacktrace: + row.append(case.error_stacktrace or EMPTY_CELL_STR) + + return row + def _render_durations(self, case: ReportCase | ReportCaseAggregate) -> str: """Build the diff string for a duration value.""" case_durations: dict[str, float] = {'task': case.task_duration} @@ -884,6 +1002,9 @@ class EvaluationRenderer: # Data to include include_reasons: bool # only applies to reports, not to diffs + include_error_message: bool + include_error_stacktrace: bool + def include_scores(self, report: EvaluationReport, baseline: EvaluationReport | None = None): return any(case.scores for case in self._all_cases(report, baseline)) @@ -933,6 +1054,8 @@ def _get_case_renderer( include_reasons=self.include_reasons, include_durations=self.include_durations, include_total_duration=self.include_total_duration, + include_error_message=self.include_error_message, + include_error_stacktrace=self.include_error_stacktrace, input_renderer=input_renderer, metadata_renderer=metadata_renderer, output_renderer=output_renderer, @@ -950,7 +1073,9 @@ def build_table(self, report: EvaluationReport) -> Table: if self.include_averages: # pragma: no branch average = report.averages() - table.add_row(*case_renderer.build_aggregate_row(average)) + if average: # pragma: no branch + table.add_row(*case_renderer.build_aggregate_row(average)) + return table def build_diff_table(self, report: EvaluationReport, baseline: EvaluationReport) -> Table: @@ -997,6 +1122,14 @@ def build_diff_table(self, report: EvaluationReport, baseline: EvaluationReport) return table + def build_failures_table(self, report: EvaluationReport) -> Table: + case_renderer = self._get_case_renderer(report) + table = case_renderer.build_failures_table('Case Failures') + for case in report.failures: + table.add_row(*case_renderer.build_failure_row(case)) + + return table + def _infer_score_renderers( self, report: EvaluationReport, baseline: EvaluationReport | None ) -> dict[str, _NumberRenderer]: diff --git a/tests/evals/test_dataset.py b/tests/evals/test_dataset.py index e1c98aa556..8ac7abf663 100644 --- a/tests/evals/test_dataset.py +++ 
b/tests/evals/test_dataset.py @@ -11,7 +11,7 @@ from inline_snapshot import snapshot from pydantic import BaseModel, TypeAdapter -from ..conftest import try_import +from ..conftest import IsStr, try_import from .utils import render_table with try_import() as imports_successful: @@ -20,8 +20,17 @@ from pydantic_evals import Case, Dataset from pydantic_evals.dataset import increment_eval_metric, set_eval_attribute - from pydantic_evals.evaluators import EvaluationResult, Evaluator, EvaluatorOutput, EvaluatorSpec, LLMJudge, Python + from pydantic_evals.evaluators import ( + EvaluationResult, + Evaluator, + EvaluatorFailure, + EvaluatorOutput, + EvaluatorSpec, + LLMJudge, + Python, + ) from pydantic_evals.evaluators.context import EvaluatorContext + from pydantic_evals.reporting import EvaluationReport, ReportCase, ReportCaseAdapter, ReportCaseFailure @dataclass class MockEvaluator(Evaluator[object, object, object]): @@ -32,7 +41,6 @@ class MockEvaluator(Evaluator[object, object, object]): def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> EvaluatorOutput: return self.output - from pydantic_evals.reporting import EvaluationReport, ReportCase, ReportCaseAdapter pytestmark = [pytest.mark.skipif(not imports_successful(), reason='pydantic-evals not installed'), pytest.mark.anyio] @@ -206,6 +214,7 @@ async def mock_async_task(inputs: TaskInput) -> TaskOutput: } }, 'attributes': {}, + 'evaluator_failures': [], 'expected_output': {'answer': '4', 'confidence': 1.0}, 'inputs': {'query': 'What is 2+2?'}, 'labels': {}, @@ -223,7 +232,7 @@ async def mock_async_task(inputs: TaskInput) -> TaskOutput: }, 'span_id': '0000000000000003', 'task_duration': 1.0, - 'total_duration': 6.0, + 'total_duration': 10.0, 'trace_id': '00000000000000000000000000000001', } ) @@ -258,6 +267,7 @@ def mock_sync_task(inputs: TaskInput) -> TaskOutput: } }, 'attributes': {}, + 'evaluator_failures': [], 'expected_output': {'answer': '4', 'confidence': 1.0}, 'inputs': {'query': 'What is 2+2?'}, 'labels': {}, @@ -281,6 +291,72 @@ def mock_sync_task(inputs: TaskInput) -> TaskOutput: ) +async def test_evaluate_with_retried_task_failure( + example_dataset: Dataset[TaskInput, TaskOutput, TaskMetadata], + simple_evaluator: type[Evaluator[TaskInput, TaskOutput, TaskMetadata]], +): + try: + from tenacity import AsyncRetrying, stop_after_attempt + except ImportError: # pragma no cover + # Just pass the test if tenacity isn't installed + return + + example_dataset.add_evaluator(simple_evaluator()) + + attempt = 0 + + async def mock_async_task(inputs: TaskInput) -> TaskOutput: + nonlocal attempt + if attempt < 3: + attempt += 1 + raise RuntimeError(f'failure {attempt}') + if inputs.query == 'What is 2+2?': + return TaskOutput(answer='4') + elif inputs.query == 'What is the capital of France?': + return TaskOutput(answer='Paris') + return TaskOutput(answer='Unknown') # pragma: no cover + + report = await example_dataset.evaluate(mock_async_task, retry=AsyncRetrying(stop=stop_after_attempt(3))) + + assert attempt == 3 + + assert report is not None + assert len(report.cases) == 2 + assert ReportCaseAdapter.dump_python(report.cases[0]) == snapshot( + { + 'assertions': { + 'correct': { + 'name': 'correct', + 'reason': None, + 'source': {'name': 'SimpleEvaluator', 'arguments': None}, + 'value': True, + } + }, + 'attributes': {}, + 'evaluator_failures': [], + 'expected_output': {'answer': '4', 'confidence': 1.0}, + 'inputs': {'query': 'What is 2+2?'}, + 'labels': {}, + 'metadata': {'category': 'general', 'difficulty': 'easy'}, + 
'metrics': {}, + 'name': 'case1', + 'output': {'answer': '4', 'confidence': 1.0}, + 'scores': { + 'confidence': { + 'name': 'confidence', + 'reason': None, + 'source': {'name': 'SimpleEvaluator', 'arguments': None}, + 'value': 1.0, + } + }, + 'span_id': '0000000000000003', + 'task_duration': 1.0, + 'total_duration': 20.0, + 'trace_id': '00000000000000000000000000000001', + } + ) + + async def test_evaluate_with_concurrency( example_dataset: Dataset[TaskInput, TaskOutput, TaskMetadata], simple_evaluator: type[Evaluator[TaskInput, TaskOutput, TaskMetadata]], @@ -310,6 +386,7 @@ async def mock_task(inputs: TaskInput) -> TaskOutput: } }, 'attributes': {}, + 'evaluator_failures': [], 'expected_output': {'answer': '4', 'confidence': 1.0}, 'inputs': {'query': 'What is 2+2?'}, 'labels': {}, @@ -327,7 +404,7 @@ async def mock_task(inputs: TaskInput) -> TaskOutput: }, 'span_id': '0000000000000003', 'task_duration': 1.0, - 'total_duration': 3.0, + 'total_duration': 5.0, 'trace_id': '00000000000000000000000000000001', } ) @@ -345,11 +422,49 @@ async def failing_task(inputs: TaskInput) -> TaskOutput: raise ValueError('Task error') return TaskOutput(answer='Paris') - # TODO: Should we include the exception in the result rather than bubbling up the error? - with pytest.raises(ExceptionGroup) as exc_info: - await example_dataset.evaluate(failing_task) - assert exc_info.value == HasRepr( - repr(ExceptionGroup('unhandled errors in a TaskGroup', [ValueError('Task error')])) + report = await example_dataset.evaluate(failing_task) + assert report.cases == snapshot( + [ + ReportCase( + name='case2', + inputs=TaskInput(query='What is the capital of France?'), + metadata=TaskMetadata(difficulty='medium', category='geography'), + expected_output=TaskOutput(answer='Paris', confidence=1.0), + output=TaskOutput(answer='Paris', confidence=1.0), + metrics={}, + attributes={}, + scores={ + 'confidence': EvaluationResult( + name='confidence', value=1.0, reason=None, source=simple_evaluator().as_spec() + ) + }, + labels={}, + assertions={ + 'correct': EvaluationResult( + name='correct', value=True, reason=None, source=simple_evaluator().as_spec() + ) + }, + task_duration=1.0, + total_duration=5.0, + trace_id='00000000000000000000000000000001', + span_id='0000000000000007', + evaluator_failures=[], + ) + ] + ) + assert report.failures == snapshot( + [ + ReportCaseFailure( + name='case1', + inputs=TaskInput(query='What is 2+2?'), + metadata=TaskMetadata(difficulty='easy', category='general'), + expected_output=TaskOutput(answer='4', confidence=1.0), + error_message='ValueError: Task error', + error_stacktrace=IsStr(), + trace_id='00000000000000000000000000000001', + span_id='0000000000000003', + ) + ] ) @@ -365,20 +480,60 @@ def evaluate(self, ctx: EvaluatorContext[TaskInput, TaskOutput, TaskMetadata]): async def mock_task(inputs: TaskInput) -> TaskOutput: return TaskOutput(answer='4') - with pytest.raises(ExceptionGroup) as exc_info: - await example_dataset.evaluate(mock_task) - - assert exc_info.value == HasRepr( - repr( - ExceptionGroup( - 'unhandled errors in a TaskGroup', - [ - ExceptionGroup('unhandled errors in a TaskGroup', [ValueError('Evaluator error')]), - ExceptionGroup('unhandled errors in a TaskGroup', [ValueError('Evaluator error')]), + report = await example_dataset.evaluate(mock_task) + assert report.cases == snapshot( + [ + ReportCase( + name='case1', + inputs=TaskInput(query='What is 2+2?'), + metadata=TaskMetadata(difficulty='easy', category='general'), + expected_output=TaskOutput(answer='4', 
confidence=1.0), + output=TaskOutput(answer='4', confidence=1.0), + metrics={}, + attributes={}, + scores={}, + labels={}, + assertions={}, + task_duration=1.0, + total_duration=12.0, + trace_id='00000000000000000000000000000001', + span_id='0000000000000003', + evaluator_failures=[ + EvaluatorFailure( + name='FailingEvaluator', + error_message='ValueError: Evaluator error', + error_stacktrace=IsStr(), + source=FailingEvaluator().as_spec(), + ) ], - ) - ) + ), + ReportCase( + name='case2', + inputs=TaskInput(query='What is the capital of France?'), + metadata=TaskMetadata(difficulty='medium', category='geography'), + expected_output=TaskOutput(answer='Paris', confidence=1.0), + output=TaskOutput(answer='4', confidence=1.0), + metrics={}, + attributes={}, + scores={}, + labels={}, + assertions={}, + task_duration=1.0, + total_duration=10.0, + trace_id='00000000000000000000000000000001', + span_id='0000000000000007', + evaluator_failures=[ + EvaluatorFailure( + name='FailingEvaluator', + error_message='ValueError: Evaluator error', + error_stacktrace=IsStr(), + source=FailingEvaluator().as_spec(), + ) + ], + ), + ] ) + assert report.failures == snapshot([]) async def test_increment_eval_metric(example_dataset: Dataset[TaskInput, TaskOutput, TaskMetadata]): @@ -467,7 +622,7 @@ async def my_task(inputs: TaskInput) -> TaskOutput: }, assertions={}, task_duration=1.0, - total_duration=6.0, + total_duration=18.0, trace_id='00000000000000000000000000000001', span_id='0000000000000003', ), @@ -493,7 +648,7 @@ async def my_task(inputs: TaskInput) -> TaskOutput: }, assertions={}, task_duration=1.0, - total_duration=4.0, + total_duration=16.0, trace_id='00000000000000000000000000000001', span_id='0000000000000007', ), @@ -533,7 +688,7 @@ async def my_task(inputs: TaskInput) -> TaskOutput: }, assertions={}, task_duration=1.0, - total_duration=6.0, + total_duration=10.0, trace_id='00000000000000000000000000000001', span_id='0000000000000003', ), @@ -556,7 +711,7 @@ async def my_task(inputs: TaskInput) -> TaskOutput: }, assertions={}, task_duration=1.0, - total_duration=4.0, + total_duration=8.0, trace_id='00000000000000000000000000000001', span_id='0000000000000007', ), @@ -814,33 +969,68 @@ async def test_invalid_evaluator_output_type(example_dataset: Dataset[TaskInput, async def mock_task(inputs: TaskInput) -> TaskOutput: return TaskOutput(answer='4') - with pytest.raises(ExceptionGroup) as exc_info: - await example_dataset.evaluate(mock_task) - assert exc_info.value == HasRepr( - repr( - ExceptionGroup( - 'unhandled errors in a TaskGroup', - [ - ExceptionGroup( - 'unhandled errors in a TaskGroup', - [ - ValueError( - "Python(expression='...').evaluate returned a value of an invalid type: Ellipsis." - ) - ], - ), - ExceptionGroup( - 'unhandled errors in a TaskGroup', - [ - ValueError( - "Python(expression='...').evaluate returned a value of an invalid type: Ellipsis." 
- ) - ], - ), + report = await example_dataset.evaluate(mock_task) + assert report.cases == snapshot( + [ + ReportCase( + name='case1', + inputs=TaskInput(query='What is 2+2?'), + metadata=TaskMetadata(difficulty='easy', category='general'), + expected_output=TaskOutput(answer='4', confidence=1.0), + output=TaskOutput(answer='4', confidence=1.0), + metrics={}, + attributes={}, + scores={}, + labels={}, + assertions={}, + task_duration=1.0, + total_duration=12.0, + trace_id='00000000000000000000000000000001', + span_id='0000000000000003', + evaluator_failures=[ + EvaluatorFailure( + name='Python', + error_message='ValueError: ' + "Python(expression='...').evaluate " + 'returned a value ' + 'of an invalid ' + 'type: Ellipsis.', + error_stacktrace=IsStr(), + source=Python(expression='...').as_spec(), + ) ], - ) - ) + ), + ReportCase( + name='case2', + inputs=TaskInput(query='What is the capital of France?'), + metadata=TaskMetadata(difficulty='medium', category='geography'), + expected_output=TaskOutput(answer='Paris', confidence=1.0), + output=TaskOutput(answer='4', confidence=1.0), + metrics={}, + attributes={}, + scores={}, + labels={}, + assertions={}, + task_duration=1.0, + total_duration=10.0, + trace_id='00000000000000000000000000000001', + span_id='0000000000000007', + evaluator_failures=[ + EvaluatorFailure( + name='Python', + error_message='ValueError: ' + "Python(expression='...').evaluate " + 'returned a value ' + 'of an invalid ' + 'type: Ellipsis.', + error_stacktrace=IsStr(), + source=Python(expression='...').as_spec(), + ) + ], + ), + ] ) + assert report.failures == snapshot([]) async def test_dataset_evaluate_with_failing_task(example_dataset: Dataset[TaskInput, TaskOutput, TaskMetadata]): @@ -849,10 +1039,31 @@ async def test_dataset_evaluate_with_failing_task(example_dataset: Dataset[TaskI async def failing_task(inputs: TaskInput) -> TaskOutput: raise ValueError('Task failed') - with pytest.raises(ExceptionGroup) as exc_info: - await example_dataset.evaluate(failing_task) - assert exc_info.value == HasRepr( - repr(ExceptionGroup('unhandled errors in a TaskGroup', [ValueError('Task failed'), ValueError('Task failed')])) + report = await example_dataset.evaluate(failing_task) + assert report.cases == snapshot([]) + assert report.failures == snapshot( + [ + ReportCaseFailure( + name='case1', + inputs=TaskInput(query='What is 2+2?'), + metadata=TaskMetadata(difficulty='easy', category='general'), + expected_output=TaskOutput(answer='4', confidence=1.0), + error_message='ValueError: Task failed', + error_stacktrace=IsStr(), + trace_id='00000000000000000000000000000001', + span_id='0000000000000003', + ), + ReportCaseFailure( + name='case2', + inputs=TaskInput(query='What is the capital of France?'), + metadata=TaskMetadata(difficulty='medium', category='geography'), + expected_output=TaskOutput(answer='Paris', confidence=1.0), + error_message='ValueError: Task failed', + error_stacktrace=IsStr(), + trace_id='00000000000000000000000000000001', + span_id='0000000000000007', + ), + ] ) @@ -868,19 +1079,60 @@ def evaluate(self, ctx: EvaluatorContext[TaskInput, TaskOutput, TaskMetadata]) - async def task(inputs: TaskInput) -> TaskOutput: return TaskOutput(answer=inputs.query.upper()) - with pytest.raises(ExceptionGroup) as exc_info: - await example_dataset.evaluate(task) - assert exc_info.value == HasRepr( - repr( - ExceptionGroup( - 'unhandled errors in a TaskGroup', - [ - ExceptionGroup('unhandled errors in a TaskGroup', [ValueError('Evaluator failed')]), - 
ExceptionGroup('unhandled errors in a TaskGroup', [ValueError('Evaluator failed')]), + report = await example_dataset.evaluate(task) + assert report.cases == snapshot( + [ + ReportCase( + name='case1', + inputs=TaskInput(query='What is 2+2?'), + metadata=TaskMetadata(difficulty='easy', category='general'), + expected_output=TaskOutput(answer='4', confidence=1.0), + output=TaskOutput(answer='WHAT IS 2+2?', confidence=1.0), + metrics={}, + attributes={}, + scores={}, + labels={}, + assertions={}, + task_duration=1.0, + total_duration=12.0, + trace_id='00000000000000000000000000000001', + span_id='0000000000000003', + evaluator_failures=[ + EvaluatorFailure( + name='FailingEvaluator', + error_message='ValueError: Evaluator failed', + error_stacktrace=IsStr(), + source=FailingEvaluator().as_spec(), + ) ], - ) - ) + ), + ReportCase( + name='case2', + inputs=TaskInput(query='What is the capital of France?'), + metadata=TaskMetadata(difficulty='medium', category='geography'), + expected_output=TaskOutput(answer='Paris', confidence=1.0), + output=TaskOutput(answer='WHAT IS THE CAPITAL OF FRANCE?', confidence=1.0), + metrics={}, + attributes={}, + scores={}, + labels={}, + assertions={}, + task_duration=1.0, + total_duration=10.0, + trace_id='00000000000000000000000000000001', + span_id='0000000000000007', + evaluator_failures=[ + EvaluatorFailure( + name='FailingEvaluator', + error_message='ValueError: Evaluator failed', + error_stacktrace=IsStr(), + source=FailingEvaluator().as_spec(), + ) + ], + ), + ] ) + assert report.failures == snapshot([]) async def test_dataset_evaluate_with_invalid_evaluator_result( @@ -901,33 +1153,70 @@ def evaluate(self, ctx: EvaluatorContext[TaskInput, TaskOutput, TaskMetadata]) - async def task(inputs: TaskInput) -> TaskOutput: return TaskOutput(answer=inputs.query.upper()) - with pytest.raises(ExceptionGroup) as exc_info: - await example_dataset.evaluate(task) - assert exc_info.value == HasRepr( - repr( - ExceptionGroup( - 'unhandled errors in a TaskGroup', - [ - ExceptionGroup( - 'unhandled errors in a TaskGroup', - [ - ValueError( - 'test_dataset_evaluate_with_invalid_evaluator_result..InvalidEvaluator().evaluate returned a value of an invalid type: test_dataset_evaluate_with_invalid_evaluator_result..MyObject().' - ) - ], - ), - ExceptionGroup( - 'unhandled errors in a TaskGroup', - [ - ValueError( - 'test_dataset_evaluate_with_invalid_evaluator_result..InvalidEvaluator().evaluate returned a value of an invalid type: test_dataset_evaluate_with_invalid_evaluator_result..MyObject().' 
- ) - ], - ), + report = await example_dataset.evaluate(task) + assert report.cases == snapshot( + [ + ReportCase( + name='case1', + inputs=TaskInput(query='What is 2+2?'), + metadata=TaskMetadata(difficulty='easy', category='general'), + expected_output=TaskOutput(answer='4', confidence=1.0), + output=TaskOutput(answer='WHAT IS 2+2?', confidence=1.0), + metrics={}, + attributes={}, + scores={}, + labels={}, + assertions={}, + task_duration=1.0, + total_duration=12.0, + trace_id='00000000000000000000000000000001', + span_id='0000000000000003', + evaluator_failures=[ + EvaluatorFailure( + name='InvalidEvaluator', + error_message='ValueError: ' + 'test_dataset_evaluate_with_invalid_evaluator_result..InvalidEvaluator().evaluate ' + 'returned a value ' + 'of an invalid ' + 'type: ' + 'test_dataset_evaluate_with_invalid_evaluator_result..MyObject().', + error_stacktrace=IsStr(), + source=InvalidEvaluator().as_spec(), + ) ], - ) - ) + ), + ReportCase( + name='case2', + inputs=TaskInput(query='What is the capital of France?'), + metadata=TaskMetadata(difficulty='medium', category='geography'), + expected_output=TaskOutput(answer='Paris', confidence=1.0), + output=TaskOutput(answer='WHAT IS THE CAPITAL OF FRANCE?', confidence=1.0), + metrics={}, + attributes={}, + scores={}, + labels={}, + assertions={}, + task_duration=1.0, + total_duration=10.0, + trace_id='00000000000000000000000000000001', + span_id='0000000000000007', + evaluator_failures=[ + EvaluatorFailure( + name='InvalidEvaluator', + error_message='ValueError: ' + 'test_dataset_evaluate_with_invalid_evaluator_result..InvalidEvaluator().evaluate ' + 'returned a value ' + 'of an invalid ' + 'type: ' + 'test_dataset_evaluate_with_invalid_evaluator_result..MyObject().', + error_stacktrace=IsStr(), + source=InvalidEvaluator().as_spec(), + ) + ], + ), + ] ) + assert report.failures == snapshot([]) async def test_dataset_evaluate_with_custom_name(example_dataset: Dataset[TaskInput, TaskOutput, TaskMetadata]): @@ -1177,22 +1466,23 @@ async def mock_async_task(inputs: TaskInput) -> TaskOutput: assert spans == [ { 'attributes': { - 'averages': '{"name":"Averages","scores":{"confidence":1.0},"labels":{},"metrics":{},"assertions":1.0,"task_duration":1.0,"total_duration":5.0}', + 'averages': '{"name":"Averages","scores":{"confidence":1.0},"labels":{},"metrics":{},"assertions":1.0,"task_duration":1.0,"total_duration":9.0}', 'cases': '[{"name":"case1","inputs":{"query":"What is ' - '2+2?"},"metadata":{"difficulty":"easy","category":"general"},"expected_output":{"answer":"4","confidence":1.0},"output":{"answer":"4","confidence":1.0},"metrics":{},"attributes":{},"scores":{"confidence":{"name":"confidence","value":1.0,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}},"labels":{},"assertions":{"correct":{"name":"correct","value":true,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}},"task_duration":1.0,"total_duration":6.0,"trace_id":"00000000000000000000000000000001","span_id":"0000000000000003"},{"name":"case2","inputs":{"query":"What ' + 
'2+2?"},"metadata":{"difficulty":"easy","category":"general"},"expected_output":{"answer":"4","confidence":1.0},"output":{"answer":"4","confidence":1.0},"metrics":{},"attributes":{},"scores":{"confidence":{"name":"confidence","value":1.0,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}},"labels":{},"assertions":{"correct":{"name":"correct","value":true,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}},"task_duration":1.0,"total_duration":10.0,"trace_id":"00000000000000000000000000000001","span_id":"0000000000000003","evaluator_failures":[]},{"name":"case2","inputs":{"query":"What ' 'is the capital of ' - 'France?"},"metadata":{"difficulty":"medium","category":"geography"},"expected_output":{"answer":"Paris","confidence":1.0},"output":{"answer":"Paris","confidence":1.0},"metrics":{},"attributes":{},"scores":{"confidence":{"name":"confidence","value":1.0,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}},"labels":{},"assertions":{"correct":{"name":"correct","value":true,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}},"task_duration":1.0,"total_duration":4.0,"trace_id":"00000000000000000000000000000001","span_id":"0000000000000007"}]', + 'France?"},"metadata":{"difficulty":"medium","category":"geography"},"expected_output":{"answer":"Paris","confidence":1.0},"output":{"answer":"Paris","confidence":1.0},"metrics":{},"attributes":{},"scores":{"confidence":{"name":"confidence","value":1.0,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}},"labels":{},"assertions":{"correct":{"name":"correct","value":true,"reason":null,"source":{"name":"SimpleEvaluator","arguments":null}}},"task_duration":1.0,"total_duration":8.0,"trace_id":"00000000000000000000000000000001","span_id":"0000000000000007","evaluator_failures":[]}]', 'code.filepath': 'test_dataset.py', 'code.function': 'test_evaluate_async_logfire', 'code.lineno': 123, - 'logfire.json_schema': '{"type":"object","properties":{"name":{},"cases":{"type":"array"},"averages":{"type":"object"}}}', + 'failures': '[]', + 'logfire.json_schema': '{"type":"object","properties":{"name":{},"cases":{"type":"array"},"failures":{"type":"array"},"averages":{"type":"object"}}}', 'logfire.msg': 'evaluate mock_async_task', 'logfire.msg_template': 'evaluate {name}', 'logfire.span_type': 'span', 'name': 'mock_async_task', }, 'context': {'is_remote': False, 'span_id': 1, 'trace_id': 1}, - 'end_time': 10000000000, + 'end_time': 14000000000, 'name': 'evaluate {name}', 'parent': None, 'start_time': 1000000000, @@ -1218,7 +1508,7 @@ async def mock_async_task(inputs: TaskInput) -> TaskOutput: 'task_name': 'mock_async_task', }, 'context': {'is_remote': False, 'span_id': 3, 'trace_id': 1}, - 'end_time': 8000000000, + 'end_time': 12000000000, 'name': 'case: {case_name}', 'parent': {'is_remote': False, 'span_id': 1, 'trace_id': 1}, 'start_time': 2000000000, @@ -1261,7 +1551,7 @@ async def mock_async_task(inputs: TaskInput) -> TaskOutput: 'task_name': 'mock_async_task', }, 'context': {'is_remote': False, 'span_id': 7, 'trace_id': 1}, - 'end_time': 9000000000, + 'end_time': 13000000000, 'name': 'case: {case_name}', 'parent': {'is_remote': False, 'span_id': 1, 'trace_id': 1}, 'start_time': 5000000000, @@ -1283,4 +1573,36 @@ async def mock_async_task(inputs: TaskInput) -> TaskOutput: 'start_time': 6000000000, } ), + IsPartialDict( + { + 'attributes': { + 'evaluator_name': 'SimpleEvaluator', + 'logfire.json_schema': '{"type":"object","properties":{"evaluator_name":{}}}', + 'logfire.msg': 
'evaluator: SimpleEvaluator', + 'logfire.msg_template': 'evaluator: {evaluator_name}', + 'logfire.span_type': 'span', + }, + 'context': {'is_remote': False, 'span_id': 11, 'trace_id': 1}, + 'end_time': 9000000000, + 'name': 'evaluator: {evaluator_name}', + 'parent': {'is_remote': False, 'span_id': 3, 'trace_id': 1}, + 'start_time': 8000000000, + } + ), + IsPartialDict( + { + 'attributes': { + 'evaluator_name': 'SimpleEvaluator', + 'logfire.json_schema': '{"type":"object","properties":{"evaluator_name":{}}}', + 'logfire.msg': 'evaluator: SimpleEvaluator', + 'logfire.msg_template': 'evaluator: {evaluator_name}', + 'logfire.span_type': 'span', + }, + 'context': {'is_remote': False, 'span_id': 13, 'trace_id': 1}, + 'end_time': 11000000000, + 'name': 'evaluator: {evaluator_name}', + 'parent': {'is_remote': False, 'span_id': 7, 'trace_id': 1}, + 'start_time': 10000000000, + } + ), ] diff --git a/tests/evals/test_evaluator_base.py b/tests/evals/test_evaluator_base.py index 850fcc24fd..db3dd55719 100644 --- a/tests/evals/test_evaluator_base.py +++ b/tests/evals/test_evaluator_base.py @@ -1,8 +1,9 @@ from __future__ import annotations as _annotations import asyncio +from collections.abc import Sequence from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Union import pytest from inline_snapshot import snapshot @@ -17,6 +18,7 @@ EvaluationReason, EvaluationResult, Evaluator, + EvaluatorFailure, ) from pydantic_evals.otel._errors import SpanTreeRecordingError @@ -240,7 +242,9 @@ async def test_run_evaluator(): # Test with simple boolean result evaluator = SimpleEvaluator() results = await run_evaluator(evaluator, ctx) - adapter = TypeAdapter(list[EvaluationResult]) + adapter = TypeAdapter[Union[Sequence[EvaluationResult], EvaluatorFailure]]( + Union[Sequence[EvaluationResult], EvaluatorFailure] + ) assert adapter.dump_python(results) == snapshot( [ { diff --git a/tests/evals/test_evaluators.py b/tests/evals/test_evaluators.py index c6d3529794..b720b04638 100644 --- a/tests/evals/test_evaluators.py +++ b/tests/evals/test_evaluators.py @@ -12,7 +12,7 @@ from pydantic_ai.models import Model, ModelRequestParameters from pydantic_ai.settings import ModelSettings -from ..conftest import try_import +from ..conftest import IsStr, try_import with try_import() as imports_successful: import logfire @@ -32,7 +32,9 @@ from pydantic_evals.evaluators.context import EvaluatorContext from pydantic_evals.evaluators.evaluator import ( EvaluationReason, + EvaluationResult, Evaluator, + EvaluatorFailure, EvaluatorOutput, ) from pydantic_evals.evaluators.spec import EvaluatorSpec @@ -158,11 +160,14 @@ def evaluate(self, ctx: EvaluatorContext[TaskInput, TaskOutput, TaskMetadata]) - evaluator = ExampleEvaluator() results = await run_evaluator(evaluator, test_context) + assert not isinstance(results, EvaluatorFailure) assert len(results) == 1 - assert results[0].name == 'result' - assert results[0].value == 'passed' - assert results[0].reason is None - assert results[0].source == EvaluatorSpec(name='ExampleEvaluator', arguments=None) + first_result = results[0] + assert isinstance(first_result, EvaluationResult) + assert first_result.name == 'result' + assert first_result.value == 'passed' + assert first_result.reason is None + assert first_result.source == EvaluatorSpec(name='ExampleEvaluator', arguments=None) async def test_is_instance_evaluator(): @@ -289,8 +294,13 @@ def evaluate(self, ctx: EvaluatorContext[TaskInput, TaskOutput, TaskMetadata]) - 
evaluator = FailingEvaluator() # When called directly, it should raise an error - with pytest.raises(ValueError, match='Simulated error'): - await run_evaluator(evaluator, test_context) + result = await run_evaluator(evaluator, test_context) + assert result == EvaluatorFailure( + name='FailingEvaluator', + error_message='ValueError: Simulated error', + error_stacktrace=IsStr(), + source=FailingEvaluator().as_spec(), + ) async def test_evaluator_with_null_values(): diff --git a/tests/evals/test_reporting.py b/tests/evals/test_reporting.py index 55cde3c389..575a1d9081 100644 --- a/tests/evals/test_reporting.py +++ b/tests/evals/test_reporting.py @@ -121,6 +121,8 @@ async def test_evaluation_renderer_basic(sample_report: EvaluationReport): metric_configs={}, duration_config={}, include_reasons=False, + include_error_message=False, + include_error_stacktrace=False, ) table = renderer.build_table(sample_report) @@ -157,6 +159,8 @@ async def test_evaluation_renderer_with_reasons(sample_report: EvaluationReport) metric_configs={}, duration_config={}, include_reasons=True, + include_error_message=False, + include_error_stacktrace=False, ) table = renderer.build_table(sample_report) @@ -230,6 +234,8 @@ async def test_evaluation_renderer_with_baseline(sample_report: EvaluationReport metric_configs={}, duration_config={}, include_reasons=False, + include_error_message=False, + include_error_stacktrace=False, ) table = renderer.build_diff_table(sample_report, baseline_report) @@ -288,6 +294,8 @@ async def test_evaluation_renderer_with_removed_cases(sample_report: EvaluationR metric_configs={}, duration_config={}, include_reasons=False, + include_error_message=False, + include_error_stacktrace=False, ) table = renderer.build_diff_table(sample_report, baseline_report) @@ -352,6 +360,8 @@ async def test_evaluation_renderer_with_custom_configs(sample_report: Evaluation 'diff_decrease_style': 'bold green', }, include_reasons=False, + include_error_message=False, + include_error_stacktrace=False, ) table = renderer.build_table(sample_report) diff --git a/tests/evals/test_reports.py b/tests/evals/test_reports.py index 2d06b99d45..6bec610e17 100644 --- a/tests/evals/test_reports.py +++ b/tests/evals/test_reports.py @@ -215,6 +215,7 @@ async def test_report_with_error(mock_evaluator: Evaluator[TaskInput, TaskOutput } }, 'attributes': {'error': 'Division by zero'}, + 'evaluator_failures': [], 'expected_output': {'answer': 'Error'}, 'inputs': {'query': 'What is 1/0?'}, 'labels': {}, diff --git a/tests/test_direct.py b/tests/test_direct.py index e9a131ea33..d15cc24b68 100644 --- a/tests/test_direct.py +++ b/tests/test_direct.py @@ -116,7 +116,7 @@ def test_model_request_stream_sync_without_context_manager(): """Test that accessing properties or iterating without context manager raises RuntimeError.""" messages: list[ModelMessage] = [ModelRequest.user_text_prompt('x')] - expected_error_msg = re.escape( + expected_error_message = re.escape( 'StreamedResponseSync must be used as a context manager. Use: `with model_request_stream_sync(...) 
as stream:`' ) @@ -126,22 +126,22 @@ def test_model_request_stream_sync_without_context_manager(): assert 'StreamedResponseSync' in stream_repr assert 'context_entered=False' in stream_repr - with pytest.raises(RuntimeError, match=expected_error_msg): + with pytest.raises(RuntimeError, match=expected_error_message): _ = stream_cm.model_name - with pytest.raises(RuntimeError, match=expected_error_msg): + with pytest.raises(RuntimeError, match=expected_error_message): _ = stream_cm.timestamp - with pytest.raises(RuntimeError, match=expected_error_msg): + with pytest.raises(RuntimeError, match=expected_error_message): stream_cm.get() - with pytest.raises(RuntimeError, match=expected_error_msg): + with pytest.raises(RuntimeError, match=expected_error_message): stream_cm.usage() - with pytest.raises(RuntimeError, match=expected_error_msg): + with pytest.raises(RuntimeError, match=expected_error_message): list(stream_cm) - with pytest.raises(RuntimeError, match=expected_error_msg): + with pytest.raises(RuntimeError, match=expected_error_message): for _ in stream_cm: break # pragma: no cover
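
Taken together, the changes above alter how `Dataset.evaluate` surfaces errors: task exceptions become `ReportCaseFailure` entries on `report.failures`, evaluator exceptions become `EvaluatorFailure` entries on each `ReportCase`, `averages()` can now return `None`, and an optional tenacity `AsyncRetrying` can be passed as `retry`. The sketch below is not part of the patch; it is a minimal usage example that assumes tenacity is installed (it is optional, mirroring the lazy import in the new test), and `flaky_task` plus the single case are hypothetical.

# Usage sketch for the API added in this diff (illustrative only, not part of the patch).
import asyncio

from tenacity import AsyncRetrying, stop_after_attempt

from pydantic_evals import Case, Dataset


async def flaky_task(inputs: str) -> str:
    # Hypothetical stand-in task; transient errors it raises are absorbed by the retry config.
    return inputs.upper()


async def main() -> None:
    dataset = Dataset(cases=[Case(name='case1', inputs='hello', expected_output='HELLO')])

    # New: task exceptions no longer abort the whole evaluation, and `retry` is applied per task call.
    report = await dataset.evaluate(flaky_task, retry=AsyncRetrying(stop=stop_after_attempt(3)))

    # Cases whose task ultimately raised end up in `report.failures` instead of an ExceptionGroup.
    for failure in report.failures:
        print(failure.name, failure.error_message)

    # Evaluator exceptions are collected per case rather than raised.
    for case in report.cases:
        for evaluator_failure in case.evaluator_failures:
            print(case.name, evaluator_failure.error_message)

    # `averages()` now returns None when there are no successful cases, so guard before using it.
    averages = report.averages()
    if averages is not None:
        print('assertion pass rate:', averages.assertions)

    # The printed report appends a red 'Case Failures' table when any failures were recorded.
    report.print(include_error_stacktrace=True)


if __name__ == '__main__':
    asyncio.run(main())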