|
24 | 24 | if TYPE_CHECKING: |
25 | 25 | from evalview.core.types import EvaluationResult, TestCase |
26 | 26 | from evalview.core.config import EvalViewConfig |
27 | | - from evalview.core.golden import GoldenStore |
| 27 | + from evalview.core.diff import TraceDiff |
| 28 | + from evalview.core.golden import GoldenStore, GoldenTrace |
| 29 | + from evalview.core.drift_tracker import DriftTracker |
28 | 30 | from evalview.adapters.base import AgentAdapter |
29 | 31 |
|
30 | 32 | # Load environment variables (.env is the OSS standard, .env.local for overrides) |
@@ -234,6 +236,124 @@ async def _run_one_test() -> Any: |
234 | 236 | return results |
235 | 237 |
|
236 | 238 |
|
def _execute_check_tests(
    test_cases: List["TestCase"],
    config: Optional["EvalViewConfig"],
    json_output: bool,
    semantic_diff: bool = False,
    timeout: float = 30.0,
) -> Tuple[List[Tuple[str, "TraceDiff"]], List["EvaluationResult"], "DriftTracker", Dict[str, "GoldenTrace"]]:
    """Run every test case and diff its trace against stored golden variants.

    Args:
        test_cases: Test cases to run.
        config: EvalView config (adapter, endpoint, thresholds).
        json_output: Suppress non-JSON console output when True.
        semantic_diff: Enable embedding-based semantic similarity (opt-in).
        timeout: Per-request adapter timeout, in seconds.

    Returns:
        Tuple of (diffs, results, drift_tracker, golden_traces) where
        diffs is [(test_name, TraceDiff)] and golden_traces maps test name
        to the primary GoldenTrace used for comparison.
    """
    from evalview.core.golden import GoldenStore, GoldenTrace
    from evalview.core.diff import DiffEngine
    from evalview.core.config import DiffConfig
    from evalview.core.drift_tracker import DriftTracker
    from evalview.evaluators.evaluator import Evaluator

    effective_cfg = config.get_diff_config() if config else DiffConfig()
    if semantic_diff:
        # The --semantic-diff CLI flag wins over the config-file setting.
        overrides = {**effective_cfg.model_dump(), "semantic_diff_enabled": True}
        effective_cfg = DiffConfig(**overrides)

    store = GoldenStore()
    engine = DiffEngine(config=effective_cfg)
    tracker = DriftTracker()
    evaluator = Evaluator()

    results: List["EvaluationResult"] = []
    diffs: List[Tuple[str, "TraceDiff"]] = []
    golden_traces: Dict[str, GoldenTrace] = {}

    async def _pipeline(case) -> Optional[Tuple["EvaluationResult", "TraceDiff", GoldenTrace]]:
        """Execute one case, evaluate it, and diff it against its goldens."""
        adapter_kind = case.adapter or (config.adapter if config else None)
        target = case.endpoint or (config.endpoint if config else None)
        if not adapter_kind or not target:
            return None

        private_ok = getattr(config, "allow_private_urls", True) if config else True
        try:
            adapter = _create_adapter(
                adapter_kind, target, timeout=timeout, allow_private_urls=private_ok
            )
        except ValueError as err:
            if not json_output:
                console.print(f"[yellow]⚠ Skipping {case.name}: {err}[/yellow]")
            return None

        trace = await adapter.execute(case.input.query, case.input.context)
        evaluation = await evaluator.evaluate(case, trace)

        variants = store.load_all_golden_variants(case.name)
        if not variants:
            return None

        # Async comparison so semantic diffing (when enabled) can await embeddings.
        trace_diff = await engine.compare_multi_reference_async(
            variants, trace, evaluation.score
        )
        return evaluation, trace_diff, variants[0]

    async def _gather_all() -> List:
        # One event loop for the whole batch. return_exceptions=True turns a
        # failing case into an exception *value* in the result list, so one
        # failure does not cancel its siblings.
        return await asyncio.gather(
            *(_pipeline(tc) for tc in test_cases), return_exceptions=True
        )

    outcomes = asyncio.run(_gather_all())

    # gather preserves input order, so zip pairs each case with its outcome.
    for case, outcome in zip(test_cases, outcomes):
        if isinstance(outcome, BaseException):
            if not json_output:
                if isinstance(outcome, (asyncio.TimeoutError, asyncio.CancelledError)):
                    message = f"[red]✗ {case.name}: Async execution timed out — {outcome}[/red]"
                else:
                    message = f"[red]✗ {case.name}: Failed — {outcome}[/red]"
                console.print(message)
            continue
        if outcome is None:
            continue
        evaluation, trace_diff, golden = outcome
        results.append(evaluation)
        diffs.append((case.name, trace_diff))
        golden_traces[case.name] = golden
        tracker.record_check(case.name, trace_diff)

    return diffs, results, tracker, golden_traces
| 334 | + |
| 335 | + |
def _analyze_check_diffs(diffs: List[Tuple[str, "TraceDiff"]]) -> Dict[str, Any]:
    """Analyze diffs and return summary statistics.

    Returns:
        Dict with keys: has_regressions, has_tools_changed, has_output_changed, all_passed
    """
    from evalview.core.diff import DiffStatus

    # Collect each diff's severity once, then answer every question by membership.
    severities = [diff.overall_severity for _, diff in diffs]
    summary: Dict[str, Any] = {
        "has_regressions": DiffStatus.REGRESSION in severities,
        "has_tools_changed": DiffStatus.TOOLS_CHANGED in severities,
        "has_output_changed": DiffStatus.OUTPUT_CHANGED in severities,
    }
    # Passing means none of the change categories fired.
    summary["all_passed"] = not any(summary.values())
    return summary
| 355 | + |
| 356 | + |
237 | 357 | def _cloud_push(saved_test_names: List[str]) -> None: |
238 | 358 | """Upload golden baselines for the given tests. Silently skips on error.""" |
239 | 359 | from evalview.cloud.auth import CloudAuth |
|
0 commit comments