|
| 1 | +"""Assertions for eval-level-expected-output testcase. |
| 2 | +
|
| 3 | +Validates that evaluation-level expectedOutput is correctly injected |
| 4 | +into output-based evaluators (ExactMatch, JsonSimilarity, LLMJudge) |
| 5 | +while non-output evaluators (Contains) remain unaffected. |
| 6 | +""" |
| 7 | + |
| 8 | +import json |
| 9 | +import os |
| 10 | + |
# --- Expected evaluator IDs per eval set -----------------------------------

# Deterministic eval set: output-based evaluators plus Contains.
DETERMINISTIC_EVALUATORS = {
    "ExactMatchEvaluator",
    "JsonSimilarityEvaluator",
    "ContainsEvaluator",
}

# LLM judge eval set: exact-match baseline plus the judge itself.
LLM_JUDGE_EVALUATORS = {
    "ExactMatchEvaluator",
    "LLMJudgeOutputEvaluator",
}

# --- Expected evaluation names per eval set --------------------------------

# Deterministic eval set evaluation names.
DETERMINISTIC_EVALUATIONS = {
    "Eval-level expectedOutput with null criteria (addition)",
    "Eval-level expectedOutput with null criteria (multiplication)",
    "Per-evaluator expectedOutput overrides eval-level",
    "Mixed: some evaluators null, some explicit",
}

# LLM judge eval set evaluation names.
LLM_JUDGE_EVALUATIONS = {
    "LLM Judge uses eval-level expectedOutput (addition)",
    "LLM Judge uses eval-level expectedOutput (multiplication)",
}
| 37 | + |
| 38 | + |
def validate_output_file(
    output_file: str,
    expected_evaluations: set[str],
    expected_evaluators: set[str],
    min_score: float = 0.99,
) -> None:
    """Check an evaluation output JSON file against expectations.

    Asserts the file exists and parses, verifies that every evaluator run
    scored at least ``min_score`` (a literal ``True`` also passes), and
    confirms that every expected evaluation name and evaluator ID appears
    in the results. Raises ``AssertionError`` on any discrepancy.

    Args:
        output_file: Path to the evaluation output JSON file.
        expected_evaluations: Set of evaluation names to expect.
        expected_evaluators: Set of evaluator IDs/names to expect.
        min_score: Minimum acceptable score for all evaluators.
    """
    assert os.path.isfile(output_file), f"Output file '{output_file}' not found"
    print(f" Found output file: {output_file}")

    with open(output_file, "r", encoding="utf-8") as handle:
        payload = json.load(handle)

    assert "evaluationSetResults" in payload, "Missing 'evaluationSetResults'"

    set_results = payload["evaluationSetResults"]
    assert set_results, "No evaluation results found"
    print(f" Found {len(set_results)} evaluation result(s)")

    failures = 0
    observed_evaluations: set[str] = set()
    observed_evaluators: set[str] = set()

    for entry in set_results:
        name = entry.get("evaluationName", "Unknown")
        observed_evaluations.add(name)
        print(f"\n Validating: {name}")

        runs = entry.get("evaluationRunResults", [])
        assert runs, f"No run results for '{name}'"

        for run in runs:
            evaluator_id = run.get("evaluatorId", "Unknown")
            label = run.get("evaluatorName", evaluator_id)
            score = run.get("result", {}).get("score")
            observed_evaluators.add(evaluator_id)

            # A run passes on a literal boolean True, or on any numeric
            # score meeting the threshold.
            passed = score is True or (
                isinstance(score, (int, float)) and score >= min_score
            )

            if passed:
                shown = f"{score:.2f}" if isinstance(score, float) else str(score)
                print(f" {label}: score={shown} (pass)")
            else:
                print(
                    f" {label}: score={score} "
                    f"(FAILED - expected >= {min_score})"
                )
                failures += 1

    # Every expected evaluation name must have been seen.
    missing_evals = expected_evaluations - observed_evaluations
    if missing_evals:
        print(f"\n Missing evaluations: {missing_evals}")
        failures += len(missing_evals)

    # Every expected evaluator ID must have been seen.
    missing_evaluators = expected_evaluators - observed_evaluators
    if missing_evaluators:
        print(f"\n Missing evaluators: {missing_evaluators}")
        failures += len(missing_evaluators)

    print(f"\n{'=' * 60}")
    print(f" Failed: {failures}")
    print(f"{'=' * 60}")

    assert failures == 0, f"{failures} assertion(s) failed for {output_file}"
    print(f"\n All assertions passed for {output_file}!")
| 119 | + |
| 120 | + |
def main() -> None:
    """Run all eval-level expectedOutput assertions."""
    # Deterministic evaluators (ExactMatch, JsonSimilarity, Contains):
    # exact computations, so every score must reach 0.99.
    print("\n--- Deterministic Evaluators ---")
    validate_output_file(
        "eval-level-expected-output.json",
        expected_evaluations=DETERMINISTIC_EVALUATIONS,
        expected_evaluators=DETERMINISTIC_EVALUATORS,
        min_score=0.99,
    )

    # LLM judge evaluators: ExactMatch remains deterministic, while judge
    # scores can vary — they only need to stay well above zero.
    print("\n--- LLM Judge Evaluators ---")
    validate_output_file(
        "eval-level-expected-output-llm-judge.json",
        expected_evaluations=LLM_JUDGE_EVALUATIONS,
        expected_evaluators=LLM_JUDGE_EVALUATORS,
        min_score=0.5,
    )

    print("\n All eval-level expectedOutput assertions passed!")


if __name__ == "__main__":
    main()