Commit 40de0d3

feat: add evaluation-level expectedOutput to EvaluationItem (#1387)

Authored by Chibionos, Chibi Vikram, and claude
Co-authored-by: Chibi Vikram <chibivikram@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>

1 parent c522b2e · commit 40de0d3

File tree

9 files changed: +1107 -0 lines changed
samples/calculator/evaluations/eval-sets/eval-level-expected-output-llm-judge.json

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
{
  "version": "1.0",
  "id": "EvalLevelExpectedOutputLLMJudgeSet",
  "name": "Evaluation-Level expectedOutput with LLM Judge",
  "evaluatorRefs": [
    "ExactMatchEvaluator",
    "LLMJudgeOutputEvaluator"
  ],
  "evaluations": [
    {
      "id": "eval-level-llm-judge-add",
      "name": "LLM Judge uses eval-level expectedOutput (addition)",
      "inputs": {
        "a": 3,
        "b": 7,
        "operator": "+"
      },
      "expectedOutput": {
        "result": 10.0
      },
      "expectedAgentBehavior": "The agent should correctly add the two numbers and return the result.",
      "evaluationCriterias": {
        "ExactMatchEvaluator": null,
        "LLMJudgeOutputEvaluator": null
      }
    },
    {
      "id": "eval-level-llm-judge-multiply",
      "name": "LLM Judge uses eval-level expectedOutput (multiplication)",
      "inputs": {
        "a": 6,
        "b": 8,
        "operator": "*"
      },
      "expectedOutput": {
        "result": 48.0
      },
      "expectedAgentBehavior": "The agent should correctly multiply the two numbers and return the result.",
      "evaluationCriterias": {
        "ExactMatchEvaluator": null,
        "LLMJudgeOutputEvaluator": null
      }
    }
  ]
}
samples/calculator/evaluations/eval-sets/eval-level-expected-output.json

Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
{
  "version": "1.0",
  "id": "EvalLevelExpectedOutputSet",
  "name": "Evaluation-Level expectedOutput Tests",
  "evaluatorRefs": [
    "ExactMatchEvaluator",
    "JsonSimilarityEvaluator",
    "ContainsEvaluator"
  ],
  "evaluations": [
    {
      "id": "eval-level-null-criteria-add",
      "name": "Eval-level expectedOutput with null criteria (addition)",
      "inputs": {
        "a": 2,
        "b": 3,
        "operator": "+"
      },
      "expectedOutput": {
        "result": 5.0
      },
      "evaluationCriterias": {
        "ExactMatchEvaluator": null,
        "JsonSimilarityEvaluator": null,
        "ContainsEvaluator": {
          "searchText": "5"
        }
      }
    },
    {
      "id": "eval-level-null-criteria-multiply",
      "name": "Eval-level expectedOutput with null criteria (multiplication)",
      "inputs": {
        "a": 4,
        "b": 5,
        "operator": "*"
      },
      "expectedOutput": {
        "result": 20.0
      },
      "evaluationCriterias": {
        "ExactMatchEvaluator": null,
        "JsonSimilarityEvaluator": null,
        "ContainsEvaluator": {
          "searchText": "20"
        }
      }
    },
    {
      "id": "eval-level-per-evaluator-override",
      "name": "Per-evaluator expectedOutput overrides eval-level",
      "inputs": {
        "a": 10,
        "b": 5,
        "operator": "-"
      },
      "expectedOutput": {
        "result": 5.0
      },
      "evaluationCriterias": {
        "ExactMatchEvaluator": {
          "expectedOutput": {
            "result": 5.0
          }
        },
        "JsonSimilarityEvaluator": {
          "expectedOutput": {
            "result": 5.0
          }
        },
        "ContainsEvaluator": {
          "searchText": "5"
        }
      }
    },
    {
      "id": "eval-level-mixed-null-and-explicit",
      "name": "Mixed: some evaluators null, some explicit",
      "inputs": {
        "a": 7,
        "b": 3,
        "operator": "+"
      },
      "expectedOutput": {
        "result": 10.0
      },
      "evaluationCriterias": {
        "ExactMatchEvaluator": null,
        "JsonSimilarityEvaluator": {
          "expectedOutput": {
            "result": 10.0
          }
        },
        "ContainsEvaluator": {
          "searchText": "10"
        }
      }
    }
  ]
}

src/uipath/eval/models/evaluation_set.py

Lines changed: 3 additions & 0 deletions
@@ -81,6 +81,9 @@ class EvaluationItem(BaseModel):
     id: str
     name: str
     inputs: dict[str, Any]
+    expected_output: dict[str, Any] | str | None = Field(
+        default=None, alias="expectedOutput"
+    )
     evaluation_criterias: dict[str, dict[str, Any] | None] = Field(
         ..., alias="evaluationCriterias"
     )
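
Aside: a minimal sketch of how the new field behaves at parse time, assuming pydantic v2 (model_validate) and trimming EvaluationItem to just the fields visible in the hunk above; the sample payload is one entry from the LLM judge eval set in this commit.

    from typing import Any

    from pydantic import BaseModel, Field


    class EvaluationItem(BaseModel):
        """Trimmed to the fields shown in the diff above."""

        id: str
        name: str
        inputs: dict[str, Any]
        expected_output: dict[str, Any] | str | None = Field(
            default=None, alias="expectedOutput"
        )
        evaluation_criterias: dict[str, dict[str, Any] | None] = Field(
            ..., alias="evaluationCriterias"
        )


    # The camelCase alias maps onto the snake_case attribute, and the
    # field is optional (older eval sets without it still parse).
    item = EvaluationItem.model_validate(
        {
            "id": "eval-level-llm-judge-add",
            "name": "LLM Judge uses eval-level expectedOutput (addition)",
            "inputs": {"a": 3, "b": 7, "operator": "+"},
            "expectedOutput": {"result": 10.0},
            "evaluationCriterias": {
                "ExactMatchEvaluator": None,
                "LLMJudgeOutputEvaluator": None,
            },
        }
    )
    assert item.expected_output == {"result": 10.0}
    assert item.evaluation_criterias["ExactMatchEvaluator"] is None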

src/uipath/eval/runtime/runtime.py

Lines changed: 17 additions & 0 deletions
@@ -46,6 +46,7 @@
 
 from .._execution_context import ExecutionSpanCollector
 from ..evaluators.base_evaluator import GenericBaseEvaluator
+from ..evaluators.output_evaluator import OutputEvaluationCriteria
 from ..mocks._cache_manager import CacheManager
 from ..mocks._input_mocker import (
     generate_llm_input,
@@ -549,6 +550,22 @@ async def _execute_eval(
                 continue
             evaluation_criteria = eval_item.evaluation_criterias[evaluator.id]
 
+            # Inject eval-level expectedOutput for output-based evaluators
+            if eval_item.expected_output is not None and issubclass(
+                evaluator.evaluation_criteria_type,
+                OutputEvaluationCriteria,
+            ):
+                if evaluation_criteria is None:
+                    evaluation_criteria = {
+                        "expectedOutput": eval_item.expected_output
+                    }
+                elif "expectedOutput" not in evaluation_criteria:
+                    evaluation_criteria = {
+                        **evaluation_criteria,
+                        "expectedOutput": eval_item.expected_output,
+                    }
+                # else: per-evaluator expectedOutput takes precedence
+
             evaluation_result = await self.run_evaluator(
                 evaluator=evaluator,
                 execution_output=agent_execution_output,
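
Aside: the precedence rules in that hunk, restated as a standalone function for illustration; resolve_criteria and its parameters are hypothetical names, not part of this commit.

    from typing import Any


    def resolve_criteria(
        eval_level_expected: dict[str, Any] | str | None,
        criteria: dict[str, Any] | None,
        is_output_evaluator: bool,
    ) -> dict[str, Any] | None:
        """Eval-level expectedOutput fills gaps; per-evaluator values win."""
        if eval_level_expected is None or not is_output_evaluator:
            return criteria  # nothing to inject
        if criteria is None:
            return {"expectedOutput": eval_level_expected}
        if "expectedOutput" not in criteria:
            return {**criteria, "expectedOutput": eval_level_expected}
        return criteria  # per-evaluator expectedOutput takes precedence


    # Null criteria: the eval-level value is injected.
    assert resolve_criteria({"result": 5.0}, None, True) == {
        "expectedOutput": {"result": 5.0}
    }
    # Explicit per-evaluator value: left untouched.
    assert resolve_criteria(
        {"result": 5.0}, {"expectedOutput": {"result": 99.0}}, True
    ) == {"expectedOutput": {"result": 99.0}}
    # Non-output evaluator (e.g. ContainsEvaluator): never injected.
    assert resolve_criteria({"result": 5.0}, {"searchText": "5"}, False) == {
        "searchText": "5"
    }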
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
[project]
name = "eval-level-expected-output"
version = "0.0.1"
description = "Tests for evaluation-level expectedOutput on EvaluationItem"
authors = [{ name = "John Doe", email = "john.doe@myemail.com" }]
dependencies = [
    "uipath",
]
requires-python = ">=3.11"

[tool.uv.sources]
uipath = { path = "../../", editable = true }
Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
#!/bin/bash
set -e

echo "Syncing dependencies..."
uv sync

echo "Authenticating with UiPath..."
uv run uipath auth --client-id="$CLIENT_ID" --client-secret="$CLIENT_SECRET" --base-url="$BASE_URL"

echo "Running eval-level expectedOutput evaluations (deterministic evaluators)..."
uv run uipath eval main ../../samples/calculator/evaluations/eval-sets/eval-level-expected-output.json --no-report --output-file eval-level-expected-output.json

echo "Running eval-level expectedOutput evaluations (LLM judge)..."
uv run uipath eval main ../../samples/calculator/evaluations/eval-sets/eval-level-expected-output-llm-judge.json --no-report --output-file eval-level-expected-output-llm-judge.json

echo "Test completed successfully!"
Lines changed: 148 additions & 0 deletions
@@ -0,0 +1,148 @@
"""Assertions for eval-level-expected-output testcase.

Validates that evaluation-level expectedOutput is correctly injected
into output-based evaluators (ExactMatch, JsonSimilarity, LLMJudge)
while non-output evaluators (Contains) remain unaffected.
"""

import json
import os

# Evaluators expected in the deterministic eval set
DETERMINISTIC_EVALUATORS = {
    "ExactMatchEvaluator",
    "JsonSimilarityEvaluator",
    "ContainsEvaluator",
}

# Evaluators expected in the LLM judge eval set
LLM_JUDGE_EVALUATORS = {
    "ExactMatchEvaluator",
    "LLMJudgeOutputEvaluator",
}

# Evaluations in the deterministic eval set
DETERMINISTIC_EVALUATIONS = {
    "Eval-level expectedOutput with null criteria (addition)",
    "Eval-level expectedOutput with null criteria (multiplication)",
    "Per-evaluator expectedOutput overrides eval-level",
    "Mixed: some evaluators null, some explicit",
}

# Evaluations in the LLM judge eval set
LLM_JUDGE_EVALUATIONS = {
    "LLM Judge uses eval-level expectedOutput (addition)",
    "LLM Judge uses eval-level expectedOutput (multiplication)",
}


def validate_output_file(
    output_file: str,
    expected_evaluations: set[str],
    expected_evaluators: set[str],
    min_score: float = 0.99,
) -> None:
    """Validate an evaluation output file.

    Args:
        output_file: Path to the evaluation output JSON file.
        expected_evaluations: Set of evaluation names to expect.
        expected_evaluators: Set of evaluator IDs/names to expect.
        min_score: Minimum acceptable score for all evaluators.
    """
    assert os.path.isfile(output_file), f"Output file '{output_file}' not found"
    print(f"  Found output file: {output_file}")

    with open(output_file, "r", encoding="utf-8") as f:
        output_data = json.load(f)

    assert "evaluationSetResults" in output_data, "Missing 'evaluationSetResults'"

    evaluation_results = output_data["evaluationSetResults"]
    assert len(evaluation_results) > 0, "No evaluation results found"
    print(f"  Found {len(evaluation_results)} evaluation result(s)")

    failed_count = 0
    seen_evaluations: set[str] = set()
    seen_evaluators: set[str] = set()

    for eval_result in evaluation_results:
        eval_name = eval_result.get("evaluationName", "Unknown")
        seen_evaluations.add(eval_name)
        print(f"\n  Validating: {eval_name}")

        eval_run_results = eval_result.get("evaluationRunResults", [])
        assert len(eval_run_results) > 0, f"No run results for '{eval_name}'"

        for eval_run in eval_run_results:
            evaluator_id = eval_run.get("evaluatorId", "Unknown")
            evaluator_name = eval_run.get("evaluatorName", evaluator_id)
            result = eval_run.get("result", {})
            score = result.get("score")

            seen_evaluators.add(evaluator_id)

            is_passing = False
            if score is True:
                is_passing = True
            elif isinstance(score, (int, float)) and score >= min_score:
                is_passing = True

            if is_passing:
                display = f"{score:.2f}" if isinstance(score, float) else str(score)
                print(f"    {evaluator_name}: score={display} (pass)")
            else:
                print(
                    f"    {evaluator_name}: score={score} "
                    f"(FAILED - expected >= {min_score})"
                )
                failed_count += 1

    # Verify all expected evaluations were seen
    missing_evals = expected_evaluations - seen_evaluations
    if missing_evals:
        print(f"\n  Missing evaluations: {missing_evals}")
        failed_count += len(missing_evals)

    # Verify all expected evaluators were seen
    missing_evaluators = expected_evaluators - seen_evaluators
    if missing_evaluators:
        print(f"\n  Missing evaluators: {missing_evaluators}")
        failed_count += len(missing_evaluators)

    print(f"\n{'=' * 60}")
    print(f"  Failed: {failed_count}")
    print(f"{'=' * 60}")

    assert failed_count == 0, f"{failed_count} assertion(s) failed for {output_file}"
    print(f"\n  All assertions passed for {output_file}!")


def main() -> None:
    """Main assertion logic."""
    # 1. Validate deterministic evaluators (ExactMatch, JsonSimilarity, Contains).
    #    All scores should be >= 0.99 since these are deterministic calculations.
    print("\n--- Deterministic Evaluators ---")
    validate_output_file(
        "eval-level-expected-output.json",
        expected_evaluations=DETERMINISTIC_EVALUATIONS,
        expected_evaluators=DETERMINISTIC_EVALUATORS,
        min_score=0.99,
    )

    # 2. Validate LLM judge evaluators.
    #    ExactMatch should score >= 0.99; LLM judge scores can vary
    #    but should be well above 0 (semantically correct answers).
    print("\n--- LLM Judge Evaluators ---")
    validate_output_file(
        "eval-level-expected-output-llm-judge.json",
        expected_evaluations=LLM_JUDGE_EVALUATIONS,
        expected_evaluators=LLM_JUDGE_EVALUATORS,
        min_score=0.5,  # LLM judge scores can vary, but should be well above 0
    )

    print("\n  All eval-level expectedOutput assertions passed!")


if __name__ == "__main__":
    main()
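
Aside: a hypothetical minimal output file that would satisfy validate_output_file above; the field names come from the script's .get() calls, while the concrete values are made up for illustration.

    # Shape consumed by the assertions: evaluationSetResults ->
    # evaluationRunResults -> result.score (bool or numeric).
    sample_output = {
        "evaluationSetResults": [
            {
                "evaluationName": "Per-evaluator expectedOutput overrides eval-level",
                "evaluationRunResults": [
                    {
                        "evaluatorId": "ExactMatchEvaluator",
                        "evaluatorName": "ExactMatchEvaluator",
                        "result": {"score": 1.0},
                    }
                ],
            }
        ]
    }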
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
{
  "functions": {
    "main": "../../samples/calculator/main.py:main"
  }
}
