Skip to content

Commit b0b28fd

Browse files
hidai25 authored and claude committed
Fix CI failures: Python 3.9 type hints and pytest-asyncio compatibility
- Update type hints in hallucination_evaluator.py and safety_evaluator.py to use Tuple/List from typing module (Python 3.9 compatible)
- Remove deprecated event_loop fixture from conftest.py (incompatible with asyncio_mode=auto in pytest-asyncio >= 0.23)
- Add mock for moderation API in mock_openai_client fixture
- Add missing mocks for hallucination/safety evaluators in test_main_evaluator.py

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent cdc3544 commit b0b28fd

File tree

4 files changed

+29
-16
lines changed

4 files changed

+29
-16
lines changed

evalview/evaluators/hallucination_evaluator.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Hallucination detection evaluator."""
22

33
import os
4-
from typing import Optional
4+
from typing import Optional, Tuple, List
55
from openai import AsyncOpenAI
66

77
from evalview.core.types import (
@@ -79,7 +79,7 @@ async def evaluate(
7979

8080
async def _detect_hallucination(
8181
self, test_case: TestCase, trace: ExecutionTrace
82-
) -> tuple[bool, float, str]:
82+
) -> Tuple[bool, float, str]:
8383
"""
8484
Detect hallucinations using multiple strategies.
8585
@@ -121,7 +121,7 @@ async def _detect_hallucination(
121121

122122
return has_hallucination, confidence, details
123123

124-
def _check_tool_consistency(self, trace: ExecutionTrace) -> list[str]:
124+
def _check_tool_consistency(self, trace: ExecutionTrace) -> List[str]:
125125
"""
126126
Check if agent output is consistent with tool results.
127127
@@ -254,7 +254,7 @@ def _format_tool_results(self, tool_results: list) -> str:
254254

255255
def _check_uncertainty_handling(
256256
self, test_case: TestCase, trace: ExecutionTrace
257-
) -> list[str]:
257+
) -> List[str]:
258258
"""
259259
Check if agent properly acknowledges uncertainty.
260260

evalview/evaluators/safety_evaluator.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import os
44
import re
5-
from typing import Optional
5+
from typing import Optional, Tuple, List
66
from openai import AsyncOpenAI
77

88
from evalview.core.types import (
@@ -83,7 +83,7 @@ async def evaluate(
8383

8484
async def _check_safety(
8585
self, test_case: TestCase, trace: ExecutionTrace, config: SafetyCheck
86-
) -> tuple[bool, list[str], str, str]:
86+
) -> Tuple[bool, List[str], str, str]:
8787
"""
8888
Perform comprehensive safety checks.
8989
@@ -271,7 +271,7 @@ def _detect_pii(self, text: str) -> dict:
271271
"types": ", ".join(pii_types) if pii_types else "none",
272272
}
273273

274-
async def _llm_safety_check(self, text: str, categories: list[str]) -> dict:
274+
async def _llm_safety_check(self, text: str, categories: List[str]) -> dict:
275275
"""
276276
Use LLM for nuanced safety evaluation.
277277

tests/conftest.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,16 @@ def mock_openai_client() -> AsyncMock:
242242
)
243243
]
244244
mock_client.chat.completions.create.return_value = mock_response
245+
246+
# Mock moderation API for safety evaluator
247+
mock_moderation_response = MagicMock()
248+
mock_moderation_result = MagicMock()
249+
mock_moderation_result.flagged = False
250+
mock_moderation_result.categories = MagicMock()
251+
mock_moderation_result.categories.model_dump.return_value = {}
252+
mock_moderation_response.results = [mock_moderation_result]
253+
mock_client.moderations.create.return_value = mock_moderation_response
254+
245255
return mock_client
246256

247257

@@ -362,12 +372,5 @@ def temp_yaml_directory(tmp_path: Path) -> Path:
362372
# Async Testing Utilities
363373
# ============================================================================
364374

365-
366-
@pytest.fixture
367-
def event_loop():
368-
"""Create an event loop for async tests."""
369-
import asyncio
370-
371-
loop = asyncio.new_event_loop()
372-
yield loop
373-
loop.close()
375+
# Note: event_loop fixture removed - pytest-asyncio with asyncio_mode=auto
376+
# handles this automatically. The custom fixture is deprecated in pytest-asyncio >= 0.23

tests/test_main_evaluator.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ async def test_evaluate_all_pass(self, sample_test_case, sample_execution_trace,
3434
"""Test complete evaluation when all criteria pass."""
3535
evaluator = Evaluator()
3636
evaluator.output_evaluator.client = mock_openai_client
37+
evaluator.hallucination_evaluator.client = mock_openai_client
38+
evaluator.safety_evaluator.client = mock_openai_client
3739

3840
result = await evaluator.evaluate(sample_test_case, sample_execution_trace)
3941

@@ -51,6 +53,8 @@ async def test_evaluate_creates_all_evaluations(
5153
"""Test that all sub-evaluators are run."""
5254
evaluator = Evaluator()
5355
evaluator.output_evaluator.client = mock_openai_client
56+
evaluator.hallucination_evaluator.client = mock_openai_client
57+
evaluator.safety_evaluator.client = mock_openai_client
5458

5559
result = await evaluator.evaluate(sample_test_case, sample_execution_trace)
5660

@@ -328,6 +332,8 @@ async def test_evaluate_with_boundary_score(self, mock_openai_client):
328332
"""Test evaluation with score exactly at threshold."""
329333
evaluator = Evaluator()
330334
evaluator.output_evaluator.client = mock_openai_client
335+
evaluator.hallucination_evaluator.client = mock_openai_client
336+
evaluator.safety_evaluator.client = mock_openai_client
331337

332338
test_case = TestCase(
333339
name="test",
@@ -368,6 +374,8 @@ async def test_evaluate_score_rounding(self, mock_openai_client):
368374
"""Test that score is properly rounded to 2 decimal places."""
369375
evaluator = Evaluator()
370376
evaluator.output_evaluator.client = mock_openai_client
377+
evaluator.hallucination_evaluator.client = mock_openai_client
378+
evaluator.safety_evaluator.client = mock_openai_client
371379

372380
test_case = TestCase(
373381
name="test",
@@ -406,6 +414,8 @@ async def test_evaluate_with_no_thresholds(self, mock_openai_client):
406414
"""Test evaluation when cost/latency thresholds are not specified."""
407415
evaluator = Evaluator()
408416
evaluator.output_evaluator.client = mock_openai_client
417+
evaluator.hallucination_evaluator.client = mock_openai_client
418+
evaluator.safety_evaluator.client = mock_openai_client
409419

410420
test_case = TestCase(
411421
name="test",

0 commit comments

Comments (0)