
Commit e23e2c7

feat: add eval cli command
1 parent c67c34b commit e23e2c7

File tree

8 files changed: +503 −2 lines changed

src/uipath/_cli/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -13,6 +13,7 @@
 from .cli_pull import pull as pull  # type: ignore
 from .cli_push import push as push  # type: ignore
 from .cli_run import run as run  # type: ignore
+from .cli_eval import eval as eval  # type: ignore


 def _get_safe_version() -> str:
@@ -67,3 +68,4 @@ def cli(lv: bool, v: bool) -> None:
 cli.add_command(invoke)
 cli.add_command(push)
 cli.add_command(pull)
+cli.add_command(eval)
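The cli_eval module registered above is one of the eight changed files, but its diff is not reproduced in this excerpt. As a rough, hedged sketch of how such a command could be wired (the EvaluationService import path, the argument name, and the lack of extra options are assumptions, not the committed code):

# Hypothetical sketch only; the committed cli_eval.py is not shown in this diff.
import asyncio

import click

from ._evals._evaluation_service import EvaluationService  # assumed module path


@click.command()
@click.argument("eval_set", type=click.Path(exists=True))  # assumed argument name
def eval(eval_set: str) -> None:  # named `eval` to match `from .cli_eval import eval`
    """Run an evaluation set against the local agent."""
    service = EvaluationService(eval_set)
    asyncio.run(service.run_evaluation())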
Lines changed: 236 additions & 0 deletions
@@ -0,0 +1,236 @@
import asyncio
import json
from datetime import UTC, datetime
from pathlib import Path
from typing import Any, Dict, List

from uipath._cli._utils._console import ConsoleLogger

from ..cli_run import run
from .evaluators.llm_evaluator import LLMEvaluator
from .models import EvaluationSetResult

console = ConsoleLogger()


class EvaluationService:
    """Service for running evaluations."""

    def __init__(self, eval_set_path: str | Path):
        """Initialize the evaluation service.

        Args:
            eval_set_path: Path to the evaluation set file (can be string or Path)
        """
        self.eval_set_path = Path(eval_set_path)
        self.eval_set = self._load_eval_set()
        self.evaluators = self._load_evaluators()
        self.num_workers = 8
        self.results_lock = asyncio.Lock()
        self._initialize_results()

    def _initialize_results(self) -> None:
        """Initialize the results file and directory."""
        # Create results directory if it doesn't exist
        results_dir = self.eval_set_path.parent.parent / "results"
        results_dir.mkdir(exist_ok=True)

        # Create results file
        timestamp = datetime.now(UTC).strftime("%M-%H-%d-%m-%Y")
        eval_set_name = self.eval_set["name"]
        self.result_file = results_dir / f"eval-{eval_set_name}-{timestamp}.json"

        # Initialize with empty results
        initial_results = EvaluationSetResult(
            eval_set_id=self.eval_set["id"],
            eval_set_name=self.eval_set["name"],
            results=[],
            average_score=0.0,
        )

        with open(self.result_file, "w", encoding="utf-8") as f:
            f.write(initial_results.model_dump_json(indent=2))

    def _load_eval_set(self) -> Dict[str, Any]:
        """Load the evaluation set from file.

        Returns:
            The loaded evaluation set
        """
        with open(self.eval_set_path, "r", encoding="utf-8") as f:
            return json.load(f)

    def _load_evaluators(self) -> List[LLMEvaluator]:
        """Load evaluators referenced by the evaluation set."""
        evaluators = []
        evaluators_dir = self.eval_set_path.parent.parent / "evaluators"

        for evaluator_id in self.eval_set["evaluatorRefs"]:
            # Find evaluator file
            evaluator_file = None
            for file in evaluators_dir.glob("*.json"):
                with open(file) as f:
                    data = json.load(f)
                    if data.get("id") == evaluator_id:
                        evaluator_file = data
                        break

            if not evaluator_file:
                raise ValueError(f"Could not find evaluator with ID {evaluator_id}")

            evaluators.append(LLMEvaluator(evaluator_file))

        return evaluators

    async def _write_results(self, results: List[Any]) -> None:
        """Write evaluation results to file with async lock.

        Args:
            results: List of evaluation results to write
        """
        async with self.results_lock:
            # Read current results
            with open(self.result_file, "r", encoding="utf-8") as f:
                current_results = EvaluationSetResult.model_validate_json(f.read())

            # Add new results
            current_results.results.extend(results)

            if current_results.results:
                current_results.average_score = sum(
                    r.score for r in current_results.results
                ) / len(current_results.results)

            # Write updated results
            with open(self.result_file, "w", encoding="utf-8") as f:
                f.write(current_results.model_dump_json(indent=2))

    def _run_agent(self, input_json: str) -> Dict[str, Any]:
        """Run the agent with the given input.

        Args:
            input_json: JSON string containing input data

        Returns:
            Agent output as dictionary
        """
        try:
            # Run the agent using the CLI run command
            run.callback(
                entrypoint=None,
                input=input_json,
                resume=False,
                file=None,
                debug=False,
                debug_port=5678,
            )

            # Read the output file
            output_file = Path("__uipath") / "output.json"
            with open(output_file, "r", encoding="utf-8") as f:
                result = json.load(f)

            # Extract and parse the output content
            output_content = result.get("output", {})
            if isinstance(output_content, str):
                try:
                    return json.loads(output_content)
                except json.JSONDecodeError as e:
                    raise Exception(f"Error parsing output: {e}") from e
            return output_content

        except Exception as e:
            console.error(f"Error running agent: {str(e)}")
            return {"error": str(e)}

    async def _process_evaluation(self, eval_item: Dict[str, Any]) -> None:
        """Process a single evaluation item.

        Args:
            eval_item: The evaluation item to process
        """
        console.info(f"Running evaluation: {eval_item['name']}")

        # Run the agent using the evaluation input
        input_json = json.dumps(eval_item["inputs"])

        # Run _run_agent in a non-async context using run_in_executor
        loop = asyncio.get_running_loop()
        actual_output = await loop.run_in_executor(None, self._run_agent, input_json)

        # Run each evaluator
        eval_results = []
        for evaluator in self.evaluators:
            result = await evaluator.evaluate(
                evaluation_id=eval_item["id"],
                evaluation_name=eval_item["name"],
                input_data=eval_item["inputs"],
                expected_output=eval_item["expectedOutput"],
                actual_output=actual_output,
            )
            eval_results.append(result)

        # Write results immediately
        await self._write_results(eval_results)

        # TODO: here we should send the event to the SW eval API
        console.info(f"Evaluation {eval_item['name']} complete.")

    async def _producer_task(self, task_queue: asyncio.Queue) -> None:
        """Producer task that adds all evaluations to the queue.

        Args:
            task_queue: The asyncio queue to add tasks to
        """
        for eval_item in self.eval_set["evaluations"]:
            await task_queue.put(eval_item)

        # Add sentinel values to signal workers to stop
        for _ in range(self.num_workers):
            await task_queue.put(None)

    async def _consumer_task(self, task_queue: asyncio.Queue, worker_id: int) -> None:
        """Consumer task that processes evaluations from the queue.

        Args:
            task_queue: The asyncio queue to get tasks from
            worker_id: ID of this worker for logging
        """
        while True:
            eval_item = await task_queue.get()
            if eval_item is None:
                # Sentinel value - worker should stop
                task_queue.task_done()
                return

            try:
                await self._process_evaluation(eval_item)
                task_queue.task_done()
            except Exception as e:
                # Log error and continue to next item
                task_queue.task_done()
                console.warning(
                    f"Worker {worker_id} failed evaluation {eval_item.get('name', 'Unknown')}: {str(e)}"
                )

    async def run_evaluation(self) -> None:
        """Run the evaluation set using multiple worker tasks."""
        task_queue = asyncio.Queue()

        producer = asyncio.create_task(self._producer_task(task_queue))

        consumers = []
        for worker_id in range(self.num_workers):
            consumer = asyncio.create_task(self._consumer_task(task_queue, worker_id))
            consumers.append(consumer)

        await producer

        await task_queue.join()

        # Wait for all consumers to finish
        await asyncio.gather(*consumers)

        console.success(
            f"All evaluations complete. Results saved to {self.result_file}"
        )
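A usage sketch of the service above. Everything concrete in it is assumed (the import path, the "evals/eval-sets" directory names, and every field value); only the JSON keys mirror what EvaluationService actually reads (id, name, evaluatorRefs, and evaluations with id, name, inputs, expectedOutput), and the referenced evaluator JSON is expected to already exist in the sibling evaluators directory:

# Illustrative sketch, not part of the commit.
import asyncio
import json
from pathlib import Path

from uipath._cli._evals._evaluation_service import EvaluationService  # assumed path

sample_eval_set = {
    "id": "eval-set-001",                     # placeholder values throughout
    "name": "calculator-smoke-tests",
    "evaluatorRefs": ["evaluator-001"],       # must match an evaluator JSON "id"
    "evaluations": [
        {
            "id": "eval-001",
            "name": "addition",
            "inputs": {"a": 1, "b": 2},
            "expectedOutput": {"result": 3},
        }
    ],
}

# Write the eval set; "evals/eval-sets" is an assumed layout.
eval_set_path = Path("evals") / "eval-sets" / "calculator-smoke-tests.json"
eval_set_path.parent.mkdir(parents=True, exist_ok=True)
eval_set_path.write_text(json.dumps(sample_eval_set, indent=2), encoding="utf-8")

# Evaluators are resolved from the sibling "evaluators" directory, and results are
# written to the sibling "results" directory (created automatically).
service = EvaluationService(eval_set_path)
asyncio.run(service.run_evaluation())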
Lines changed: 119 additions & 0 deletions
@@ -0,0 +1,119 @@
import json
from typing import Any, Dict, Optional

from ..._utils._debug import console
from ...._config import Config
from ...._execution_context import ExecutionContext
from ...._services.llm_gateway_service import UiPathLlmChatService
from ...._utils.constants import (
    ENV_BASE_URL,
    ENV_UIPATH_ACCESS_TOKEN,
    ENV_UNATTENDED_USER_ACCESS_TOKEN,
    COMMUNITY_AGENTS_SUFFIX,
)
from ..models import EvaluationResult, EvaluatorCategory, LLMResponse


class LLMEvaluator:
    """Service for evaluating outputs using LLM."""

    # TODO: find a better way to structure the output
    format_instructions: dict[str, str] = {
        "role": "system",
        "content": 'Extract the data from the following text and model it like this in JSON format: {"similarity_score" = "", "score_justification" = "" . Similarity_score is a float between 0 and 100 and score_justification is a str. The output should be a plain json, nothing else. No markdown.',
    }

    def __init__(self, evaluator_config: Dict[str, Any]):
        """Initialize LLM evaluator.

        Args:
            evaluator_config: Configuration for the evaluator from evaluator JSON file
        """
        import os

        self.config = evaluator_config
        base_url_value = os.getenv(ENV_BASE_URL)
        secret_value = os.getenv(ENV_UNATTENDED_USER_ACCESS_TOKEN) or os.getenv(
            ENV_UIPATH_ACCESS_TOKEN
        )
        config = Config(
            base_url=base_url_value,  # type: ignore
            secret=secret_value,  # type: ignore
        )
        self.llm = UiPathLlmChatService(config, ExecutionContext())

        # Validate evaluator category
        if self.config.get("category") != EvaluatorCategory.LlmAsAJudge:
            raise ValueError("Evaluator must be of type LlmAsAJudge")

    async def evaluate(
        self,
        evaluation_id: str,
        evaluation_name: str,
        input_data: Dict[str, Any],
        expected_output: Dict[str, Any],
        actual_output: Dict[str, Any],
    ) -> EvaluationResult:
        """Evaluate the actual output against expected output using LLM.

        Args:
            evaluation_id: ID of the evaluation
            evaluation_name: Name of the evaluation
            input_data: Input data used for the evaluation
            expected_output: Expected output from the evaluation
            actual_output: Actual output received

        Returns:
            EvaluationResult containing the evaluation score and details
        """
        # Prepare the prompt by replacing placeholders
        prompt = self.config["prompt"]
        prompt = prompt.replace(
            "{{ExpectedOutput}}", json.dumps(expected_output, indent=2)
        )
        content = prompt.replace(
            "{{ActualOutput}}", json.dumps(actual_output, indent=2)
        )

        model: Optional[str] = self.config.get("model", None)
        if not model:
            console.error("Evaluator model cannot be extracted")

        # remove community-agents suffix from llm model name
        if model.endswith(COMMUNITY_AGENTS_SUFFIX):
            model = model.replace(COMMUNITY_AGENTS_SUFFIX, "")

        response = await self.llm.chat_completions(
            messages=[{"role": "user", "content": content}], model=model
        )
        structured_response = await self.llm.chat_completions(
            messages=[
                self.format_instructions,
                {"role": "user", "content": response.choices[-1].message.content},
            ],
            model=model,
        )
        try:
            llm_response = LLMResponse(
                **json.loads(structured_response.choices[-1].message.content)
            )
        except Exception as e:
            raise Exception(f"Error parsing LLM response: {e}") from e
        # Leave those comments
        # llm_response = LLMResponse(similarity_score=90, score_justification="test justification")
        score = llm_response.similarity_score
        details = llm_response.score_justification

        if score < 0 or score > 100:
            raise ValueError(f"Score {score} is outside valid range 0-100")

        return EvaluationResult(
            evaluation_id=evaluation_id,
            evaluation_name=evaluation_name,
            evaluator_id=self.config["id"],
            evaluator_name=self.config["name"],
            score=score,
            input=input_data,
            expected_output=expected_output,
            actual_output=actual_output,
            details=details,
        )
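For reference, a minimal evaluator configuration containing the keys LLMEvaluator reads above. All values are placeholders: the serialized category value and the model identifier are assumptions, since the EvaluatorCategory enum and the available gateway models are not part of this diff; the prompt must contain the {{ExpectedOutput}} and {{ActualOutput}} placeholders.

# Placeholder evaluator config; only the keys match what LLMEvaluator reads above.
sample_evaluator = {
    "id": "evaluator-001",
    "name": "output-similarity",
    # Must deserialize to EvaluatorCategory.LlmAsAJudge; the stored form is an assumption.
    "category": "LlmAsAJudge",
    "model": "gpt-4o",  # assumed model identifier
    # The prompt must include both placeholders that evaluate() substitutes.
    "prompt": (
        "Compare the expected output:\n{{ExpectedOutput}}\n"
        "with the actual output:\n{{ActualOutput}}\n"
        "and rate their similarity from 0 to 100, with a short justification."
    ),
}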
