Skip to content

Commit 5f2326f

Browse files
committed
refactor: split god files and add comprehensive test coverage
- Split check_cmd.py (938→385 lines) into check_display.py (display), shared.py (execution logic), and a slim check_cmd.py (commands only)
- Extract llm_configs.py from llm_provider.py (data declarations vs client logic), with re-exports for backward compatibility
- Add root cause attribution module for regression analysis
- Add 109 new tests: evaluator orchestrator (29), config models (34), security/SSRF (46)
- Update all downstream imports in monitor_cmd, test_check_pipeline, test_monitor
1 parent 86b6500 commit 5f2326f

14 files changed

+2968
-743
lines changed

evalview/commands/check_cmd.py

Lines changed: 23 additions & 515 deletions
Large diffs are not rendered by default.

evalview/commands/check_display.py

Lines changed: 412 additions & 0 deletions
Large diffs are not rendered by default.

evalview/commands/monitor_cmd.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,10 @@
1313

1414
import click
1515

16-
from evalview.commands.shared import console, _load_config_if_exists, _parse_fail_statuses
17-
from evalview.commands.check_cmd import (
16+
from evalview.commands.shared import (
17+
console,
18+
_load_config_if_exists,
19+
_parse_fail_statuses,
1820
_execute_check_tests,
1921
_analyze_check_diffs,
2022
)

evalview/commands/shared.py

Lines changed: 121 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,9 @@
2424
if TYPE_CHECKING:
2525
from evalview.core.types import EvaluationResult, TestCase
2626
from evalview.core.config import EvalViewConfig
27-
from evalview.core.golden import GoldenStore
27+
from evalview.core.diff import TraceDiff
28+
from evalview.core.golden import GoldenStore, GoldenTrace
29+
from evalview.core.drift_tracker import DriftTracker
2830
from evalview.adapters.base import AgentAdapter
2931

3032
# Load environment variables (.env is the OSS standard, .env.local for overrides)
@@ -234,6 +236,124 @@ async def _run_one_test() -> Any:
234236
return results
235237

236238

239+
def _execute_check_tests(
    test_cases: List["TestCase"],
    config: Optional["EvalViewConfig"],
    json_output: bool,
    semantic_diff: bool = False,
    timeout: float = 30.0,
) -> Tuple[List[Tuple[str, "TraceDiff"]], List["EvaluationResult"], "DriftTracker", Dict[str, "GoldenTrace"]]:
    """Execute tests and compare against golden variants.

    Args:
        test_cases: Test cases to run.
        config: EvalView config (adapter, endpoint, thresholds).
        json_output: Suppress non-JSON console output when True.
        semantic_diff: Enable embedding-based semantic similarity (opt-in).
        timeout: Per-adapter request timeout in seconds (default 30.0).

    Returns:
        Tuple of (diffs, results, drift_tracker, golden_traces) where
        diffs is [(test_name, TraceDiff)] and golden_traces maps test name
        to the primary GoldenTrace used for comparison.
    """
    # Imported lazily so merely importing this module stays cheap.
    from evalview.core.golden import GoldenStore, GoldenTrace
    from evalview.core.diff import DiffEngine
    from evalview.core.config import DiffConfig
    from evalview.core.drift_tracker import DriftTracker
    from evalview.evaluators.evaluator import Evaluator

    diff_config = config.get_diff_config() if config else DiffConfig()
    # --semantic-diff flag overrides config file setting
    if semantic_diff:
        diff_config = DiffConfig(
            **{**diff_config.model_dump(), "semantic_diff_enabled": True}
        )

    store = GoldenStore()
    diff_engine = DiffEngine(config=diff_config)
    drift_tracker = DriftTracker()
    evaluator = Evaluator()

    results: List["EvaluationResult"] = []
    diffs: List[Tuple[str, "TraceDiff"]] = []
    golden_traces: Dict[str, GoldenTrace] = {}

    async def _run_one(tc: "TestCase") -> Optional[Tuple["EvaluationResult", "TraceDiff", GoldenTrace]]:
        """Run a single test: execute -> evaluate -> diff (async pipeline).

        Returns None when the test must be skipped (no adapter/endpoint
        resolvable, adapter construction rejected, or no golden baseline).
        """
        # Per-test settings win over the config-level defaults.
        adapter_type = tc.adapter or (config.adapter if config else None)
        endpoint = tc.endpoint or (config.endpoint if config else None)
        if not adapter_type or not endpoint:
            return None

        allow_private = getattr(config, "allow_private_urls", True) if config else True
        try:
            adapter = _create_adapter(adapter_type, endpoint, timeout=timeout, allow_private_urls=allow_private)
        except ValueError as e:
            if not json_output:
                console.print(f"[yellow]⚠ Skipping {tc.name}: {e}[/yellow]")
            return None

        trace = await adapter.execute(tc.input.query, tc.input.context)
        result = await evaluator.evaluate(tc, trace)

        golden_variants = store.load_all_golden_variants(tc.name)
        if not golden_variants:
            return None

        # Use async comparison to include semantic diff when enabled
        diff = await diff_engine.compare_multi_reference_async(
            golden_variants, trace, result.score
        )
        return result, diff, golden_variants[0]

    # Run all tests concurrently in a single event loop.
    # return_exceptions=True means exceptions are returned as values (not raised),
    # so one failing test does not cancel the others.
    async def _run_all() -> List:
        return await asyncio.gather(*[_run_one(tc) for tc in test_cases], return_exceptions=True)

    outcomes = asyncio.run(_run_all())

    # gather() preserves input order, so outcomes pairs 1:1 with test_cases.
    for tc, outcome in zip(test_cases, outcomes):
        if isinstance(outcome, BaseException):
            if not json_output:
                if isinstance(outcome, (asyncio.TimeoutError, asyncio.CancelledError)):
                    console.print(f"[red]✗ {tc.name}: Async execution timed out — {outcome}[/red]")
                else:
                    console.print(f"[red]✗ {tc.name}: Failed — {outcome}[/red]")
            continue
        if outcome is None:
            continue
        result, diff, golden = outcome
        results.append(result)
        diffs.append((tc.name, diff))
        golden_traces[tc.name] = golden
        drift_tracker.record_check(tc.name, diff)

    return diffs, results, drift_tracker, golden_traces
336+
def _analyze_check_diffs(diffs: List[Tuple[str, "TraceDiff"]]) -> Dict[str, Any]:
    """Analyze diffs and return summary statistics.

    Returns:
        Dict with keys: has_regressions, has_tools_changed, has_output_changed, all_passed
    """
    from evalview.core.diff import DiffStatus

    # Collect severities once, then test membership instead of scanning per flag.
    severities = [trace_diff.overall_severity for _, trace_diff in diffs]

    summary: Dict[str, Any] = {
        "has_regressions": DiffStatus.REGRESSION in severities,
        "has_tools_changed": DiffStatus.TOOLS_CHANGED in severities,
        "has_output_changed": DiffStatus.OUTPUT_CHANGED in severities,
    }
    # Passing means none of the change categories fired.
    summary["all_passed"] = not any(summary.values())
    return summary
355+
356+
237357
def _cloud_push(saved_test_names: List[str]) -> None:
238358
"""Upload golden baselines for the given tests. Silently skips on error."""
239359
from evalview.cloud.auth import CloudAuth

0 commit comments

Comments
 (0)