8 changes: 7 additions & 1 deletion .github/workflows/ci.yml
@@ -141,10 +141,16 @@ jobs:
install:
- name: pydantic-ai-slim
command: "--package pydantic-ai-slim"
pytest-args: "--ignore=tests/evals"
- name: pydantic-evals
command: "--package pydantic-evals"
pytest-args: "tests/evals"
- name: standard
command: ""
pytest-args: ""
- name: all-extras
command: "--all-extras"
pytest-args: ""
env:
UV_PYTHON: ${{ matrix.python-version }}
CI: true
@@ -163,7 +169,7 @@ jobs:
- run: mkdir .coverage

- run: uv sync --only-dev
- run: uv run ${{ matrix.install.command }} coverage run -m pytest -n auto --dist=loadgroup
- run: uv run ${{ matrix.install.command }} coverage run -m pytest -n auto --dist=loadgroup ${{ matrix.install.pytest-args }}
env:
COVERAGE_FILE: .coverage/.coverage.${{ matrix.python-version }}-${{ matrix.install.name }}

2 changes: 0 additions & 2 deletions pydantic_evals/pydantic_evals/otel/_errors.py
@@ -3,5 +3,3 @@ class SpanTreeRecordingError(Exception):

This will either be due to missing dependencies or because a tracer provider had not been set.
"""

pass
31 changes: 16 additions & 15 deletions tests/evals/test_dataset.py
@@ -4,37 +4,38 @@
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from typing import TYPE_CHECKING, Any

import pytest
from dirty_equals import HasRepr, IsNumber, IsPartialDict
from inline_snapshot import snapshot
from pydantic import BaseModel, TypeAdapter

from ..conftest import try_import
from pydantic_evals import Case, Dataset
from pydantic_evals.dataset import increment_eval_metric, set_eval_attribute
from pydantic_evals.evaluators import EvaluationResult, Evaluator, EvaluatorOutput, EvaluatorSpec, LLMJudge, Python
from pydantic_evals.evaluators.context import EvaluatorContext
from pydantic_evals.reporting import EvaluationReport, ReportCase, ReportCaseAdapter

from .utils import render_table

with try_import() as imports_successful:
if TYPE_CHECKING:
import logfire
from logfire.testing import CaptureLogfire

from pydantic_evals import Case, Dataset
from pydantic_evals.dataset import increment_eval_metric, set_eval_attribute
from pydantic_evals.evaluators import EvaluationResult, Evaluator, EvaluatorOutput, EvaluatorSpec, LLMJudge, Python
from pydantic_evals.evaluators.context import EvaluatorContext
logfire = pytest.importorskip('logfire')

@dataclass
class MockEvaluator(Evaluator[object, object, object]):
"""This is just for testing purposes. It just returns the wrapped value."""
pytestmark = [pytest.mark.anyio]

output: EvaluatorOutput

def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> EvaluatorOutput:
return self.output
@dataclass
class MockEvaluator(Evaluator[object, object, object]):
"""This is just for testing purposes. It just returns the wrapped value."""

from pydantic_evals.reporting import EvaluationReport, ReportCase, ReportCaseAdapter
output: EvaluatorOutput

pytestmark = [pytest.mark.skipif(not imports_successful(), reason='pydantic-evals not installed'), pytest.mark.anyio]
def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> EvaluatorOutput:
return self.output


if sys.version_info < (3, 11):
71 changes: 32 additions & 39 deletions tests/evals/test_evaluator_base.py
@@ -2,25 +2,45 @@

import asyncio
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any
from typing import Any

import pytest
from inline_snapshot import snapshot
from pydantic import TypeAdapter

from ..conftest import try_import
from pydantic_evals.evaluators._run_evaluator import run_evaluator
from pydantic_evals.evaluators.context import EvaluatorContext
from pydantic_evals.evaluators.evaluator import EvaluationReason, EvaluationResult, Evaluator
from pydantic_evals.otel._errors import SpanTreeRecordingError

pytestmark = [pytest.mark.anyio]


@dataclass
class SimpleEvaluator(Evaluator[Any, Any, Any]):
value: Any = True
reason: str | None = None

def evaluate(self, ctx: EvaluatorContext) -> bool | EvaluationReason:
if self.reason is not None:
return EvaluationReason(value=self.value, reason=self.reason)
return self.value

with try_import() as imports_successful:
from pydantic_evals.evaluators._run_evaluator import run_evaluator
from pydantic_evals.evaluators.context import EvaluatorContext
from pydantic_evals.evaluators.evaluator import (
EvaluationReason,
EvaluationResult,
Evaluator,
)
from pydantic_evals.otel._errors import SpanTreeRecordingError

pytestmark = [pytest.mark.skipif(not imports_successful(), reason='pydantic-evals not installed'), pytest.mark.anyio]
@dataclass
class AsyncEvaluator(Evaluator[Any, Any, Any]):
value: Any = True
delay: float = 0.1

async def evaluate(self, ctx: EvaluatorContext) -> bool:
await asyncio.sleep(self.delay)
return self.value


@dataclass
class MultiEvaluator(Evaluator[Any, Any, Any]):
def evaluate(self, ctx: EvaluatorContext) -> dict[str, bool]:
return {'test1': True, 'test2': False}


def test_evaluation_reason():
@@ -86,33 +106,6 @@ class InvalidEvaluator(Evaluator[Any, Any, Any]): # pyright: ignore[reportUnuse
assert 'evaluate' in str(exc_info.value)


if TYPE_CHECKING or imports_successful(): # pragma: no branch

@dataclass
class SimpleEvaluator(Evaluator[Any, Any, Any]):
value: Any = True
reason: str | None = None

def evaluate(self, ctx: EvaluatorContext) -> bool | EvaluationReason:
if self.reason is not None:
return EvaluationReason(value=self.value, reason=self.reason)
return self.value

@dataclass
class AsyncEvaluator(Evaluator[Any, Any, Any]):
value: Any = True
delay: float = 0.1

async def evaluate(self, ctx: EvaluatorContext) -> bool:
await asyncio.sleep(self.delay)
return self.value

@dataclass
class MultiEvaluator(Evaluator[Any, Any, Any]):
def evaluate(self, ctx: EvaluatorContext) -> dict[str, bool]:
return {'test1': True, 'test2': False}


async def test_evaluator_sync():
"""Test synchronous evaluator execution."""
ctx = EvaluatorContext(
63 changes: 29 additions & 34 deletions tests/evals/test_evaluator_common.py
@@ -9,43 +9,38 @@
from pytest_mock import MockerFixture

from pydantic_ai.settings import ModelSettings

from ..conftest import try_import

with try_import() as imports_successful:
from pydantic_evals.evaluators import EvaluationReason, EvaluatorContext
from pydantic_evals.evaluators.common import (
DEFAULT_EVALUATORS,
Contains,
Equals,
EqualsExpected,
HasMatchingSpan,
IsInstance,
LLMJudge,
MaxDuration,
OutputConfig,
Python,
)
from pydantic_evals.otel._context_subtree import context_subtree
from pydantic_evals.otel._errors import SpanTreeRecordingError
from pydantic_evals.otel.span_tree import SpanQuery

if TYPE_CHECKING:
import logfire
from logfire.testing import CaptureLogfire

from pydantic_evals.evaluators import EvaluationReason, EvaluatorContext
from pydantic_evals.evaluators.common import (
DEFAULT_EVALUATORS,
Contains,
Equals,
EqualsExpected,
HasMatchingSpan,
IsInstance,
LLMJudge,
MaxDuration,
OutputConfig,
Python,
)
from pydantic_evals.otel._context_in_memory_span_exporter import context_subtree
from pydantic_evals.otel._errors import SpanTreeRecordingError
from pydantic_evals.otel.span_tree import SpanQuery

pytestmark = [pytest.mark.skipif(not imports_successful(), reason='pydantic-evals not installed'), pytest.mark.anyio]


if TYPE_CHECKING or imports_successful():

class MockContext(EvaluatorContext[Any, Any, Any]):
def __init__(self, output: Any = None, expected_output: Any = None, inputs: Any = None, duration: float = 0.0):
self.output = output
self.expected_output = expected_output
self.inputs = inputs
self.duration = duration
else:
MockContext = object # pragma: lax no cover
logfire = pytest.importorskip('logfire')

pytestmark = [pytest.mark.anyio]


class MockContext(EvaluatorContext[Any, Any, Any]):
def __init__(self, output: Any = None, expected_output: Any = None, inputs: Any = None, duration: float = 0.0):
self.output = output
self.expected_output = expected_output
self.inputs = inputs
self.duration = duration


async def test_equals():
11 changes: 4 additions & 7 deletions tests/evals/test_evaluator_context.py
@@ -4,14 +4,11 @@

import pytest

from ..conftest import try_import
from pydantic_evals.evaluators.context import EvaluatorContext
from pydantic_evals.otel._errors import SpanTreeRecordingError
from pydantic_evals.otel.span_tree import SpanTree

with try_import() as imports_successful:
from pydantic_evals.evaluators.context import EvaluatorContext
from pydantic_evals.otel._errors import SpanTreeRecordingError
from pydantic_evals.otel.span_tree import SpanTree

pytestmark = [pytest.mark.skipif(not imports_successful(), reason='pydantic-evals not installed'), pytest.mark.anyio]
pytestmark = [pytest.mark.anyio]


def test_evaluator_context_basic():
13 changes: 5 additions & 8 deletions tests/evals/test_evaluator_spec.py
@@ -3,15 +3,12 @@
import pytest
from pydantic import ValidationError

from ..conftest import try_import
from pydantic_evals.evaluators.spec import (
EvaluatorSpec,
_SerializedEvaluatorSpec, # pyright: ignore[reportPrivateUsage]
)

with try_import() as imports_successful:
from pydantic_evals.evaluators.spec import (
EvaluatorSpec,
_SerializedEvaluatorSpec, # pyright: ignore[reportPrivateUsage]
)

pytestmark = [pytest.mark.skipif(not imports_successful(), reason='pydantic-evals not installed'), pytest.mark.anyio]
pytestmark = [pytest.mark.anyio]


def test_evaluator_spec_basic():
47 changes: 21 additions & 26 deletions tests/evals/test_evaluators.py
@@ -1,7 +1,7 @@
from __future__ import annotations as _annotations

from dataclasses import dataclass
from typing import Any, cast
from typing import TYPE_CHECKING, Any, cast

import pytest
from inline_snapshot import snapshot
@@ -11,35 +11,30 @@
from pydantic_ai.messages import ModelMessage, ModelResponse
from pydantic_ai.models import Model, ModelRequestParameters
from pydantic_ai.settings import ModelSettings

from ..conftest import try_import

with try_import() as imports_successful:
from pydantic_evals.evaluators._run_evaluator import run_evaluator
from pydantic_evals.evaluators.common import (
Contains,
Equals,
EqualsExpected,
HasMatchingSpan,
IsInstance,
LLMJudge,
MaxDuration,
Python,
)
from pydantic_evals.evaluators.context import EvaluatorContext
from pydantic_evals.evaluators.evaluator import EvaluationReason, Evaluator, EvaluatorOutput
from pydantic_evals.evaluators.spec import EvaluatorSpec
from pydantic_evals.otel._context_subtree import context_subtree
from pydantic_evals.otel.span_tree import SpanQuery, SpanTree

if TYPE_CHECKING:
import logfire
from logfire.testing import CaptureLogfire

from pydantic_evals.evaluators._run_evaluator import run_evaluator
from pydantic_evals.evaluators.common import (
Contains,
Equals,
EqualsExpected,
HasMatchingSpan,
IsInstance,
LLMJudge,
MaxDuration,
Python,
)
from pydantic_evals.evaluators.context import EvaluatorContext
from pydantic_evals.evaluators.evaluator import (
EvaluationReason,
Evaluator,
EvaluatorOutput,
)
from pydantic_evals.evaluators.spec import EvaluatorSpec
from pydantic_evals.otel._context_in_memory_span_exporter import context_subtree
from pydantic_evals.otel.span_tree import SpanQuery, SpanTree
logfire = pytest.importorskip('logfire')

pytestmark = [pytest.mark.skipif(not imports_successful(), reason='pydantic-evals not installed'), pytest.mark.anyio]
pytestmark = [pytest.mark.anyio]


class TaskInput(BaseModel):
27 changes: 13 additions & 14 deletions tests/evals/test_llm_as_a_judge.py
@@ -4,20 +4,19 @@
from inline_snapshot import snapshot
from pytest_mock import MockerFixture

from ..conftest import BinaryContent, try_import

with try_import() as imports_successful:
from pydantic_ai.settings import ModelSettings
from pydantic_evals.evaluators.llm_as_a_judge import (
GradingOutput,
_stringify, # pyright: ignore[reportPrivateUsage]
judge_input_output,
judge_input_output_expected,
judge_output,
judge_output_expected,
)

pytestmark = [pytest.mark.skipif(not imports_successful(), reason='pydantic-evals not installed'), pytest.mark.anyio]
from pydantic_ai.settings import ModelSettings
from pydantic_evals.evaluators.llm_as_a_judge import (
GradingOutput,
_stringify, # pyright: ignore[reportPrivateUsage]
judge_input_output,
judge_input_output_expected,
judge_output,
judge_output_expected,
)

from ..conftest import BinaryContent

pytestmark = [pytest.mark.anyio]


def test_grading_output():