diff --git a/pyproject.toml b/pyproject.toml index bf148234f..21f8f967a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -116,6 +116,10 @@ no_implicit_reexport = true disallow_untyped_defs = false +[[tool.mypy.overrides]] +module = "uipath._cli._interactive.*" +disable_error_code = ["misc", "unused-ignore"] + [tool.pydantic-mypy] init_forbid_extra = true init_typed = true diff --git a/samples/calculator/evaluationSets/comprehensive_calculator_tests.json b/samples/calculator/evaluationSets/comprehensive_calculator_tests.json new file mode 100644 index 000000000..f8c941cb2 --- /dev/null +++ b/samples/calculator/evaluationSets/comprehensive_calculator_tests.json @@ -0,0 +1,118 @@ +{ + "id": "calc-comprehensive-001", + "fileName": "comprehensive_eval_set.json", + "evaluatorRefs": ["exact-match-eval", "json-similarity-eval"], + "name": "Comprehensive Calculator Tests", + "batchSize": 10, + "timeoutMinutes": 10, + "modelSettings": [], + "createdAt": "2025-01-25T00:00:00Z", + "updatedAt": "2025-01-25T00:00:00Z", + "evaluations": [ + { + "id": "add-basic", + "name": "Basic Addition", + "inputs": { + "a": 5, + "b": 3, + "operator": "+" + }, + "expectedOutput": { + "result": 8.0 + }, + "expectedAgentBehavior": "Add two positive numbers", + "simulationInstructions": "", + "simulateInput": false, + "inputGenerationInstructions": "", + "simulateTools": false, + "toolsToSimulate": [], + "evalSetId": "calc-comprehensive-001", + "createdAt": "2025-01-25T00:00:00Z", + "updatedAt": "2025-01-25T00:00:00Z" + }, + { + "id": "sub-basic", + "name": "Basic Subtraction", + "inputs": { + "a": 10, + "b": 4, + "operator": "-" + }, + "expectedOutput": { + "result": 6.0 + }, + "expectedAgentBehavior": "Subtract smaller from larger", + "simulationInstructions": "", + "simulateInput": false, + "inputGenerationInstructions": "", + "simulateTools": false, + "toolsToSimulate": [], + "evalSetId": "calc-comprehensive-001", + "createdAt": "2025-01-25T00:00:00Z", + "updatedAt": "2025-01-25T00:00:00Z" + }, + { + "id": "mul-basic", + "name": "Basic Multiplication", + "inputs": { + "a": 7, + "b": 6, + "operator": "*" + }, + "expectedOutput": { + "result": 42.0 + }, + "expectedAgentBehavior": "Multiply two integers", + "simulationInstructions": "", + "simulateInput": false, + "inputGenerationInstructions": "", + "simulateTools": false, + "toolsToSimulate": [], + "evalSetId": "calc-comprehensive-001", + "createdAt": "2025-01-25T00:00:00Z", + "updatedAt": "2025-01-25T00:00:00Z" + }, + { + "id": "div-basic", + "name": "Basic Division", + "inputs": { + "a": 15, + "b": 3, + "operator": "/" + }, + "expectedOutput": { + "result": 5.0 + }, + "expectedAgentBehavior": "Divide evenly", + "simulationInstructions": "", + "simulateInput": false, + "inputGenerationInstructions": "", + "simulateTools": false, + "toolsToSimulate": [], + "evalSetId": "calc-comprehensive-001", + "createdAt": "2025-01-25T00:00:00Z", + "updatedAt": "2025-01-25T00:00:00Z" + }, + { + "id": "div-zero", + "name": "Division by Zero", + "inputs": { + "a": 10, + "b": 0, + "operator": "/" + }, + "expectedOutput": { + "result": 0.0 + }, + "expectedAgentBehavior": "Handle division by zero", + "simulationInstructions": "", + "simulateInput": false, + "inputGenerationInstructions": "", + "simulateTools": false, + "toolsToSimulate": [], + "evalSetId": "calc-comprehensive-001", + "createdAt": "2025-01-25T00:00:00Z", + "updatedAt": "2025-01-25T00:00:00Z" + } + ] +} \ No newline at end of file diff --git a/samples/calculator/evaluators/exact_match.json 
b/samples/calculator/evaluators/exact_match.json new file mode 100644 index 000000000..4750fc819 --- /dev/null +++ b/samples/calculator/evaluators/exact_match.json @@ -0,0 +1,10 @@ +{ + "id": "exact-match-eval", + "name": "Exact Match Evaluator", + "description": "Tests for exact output matches", + "category": 0, + "type": 1, + "targetOutputKey": "*", + "createdAt": "2025-01-25T00:00:00Z", + "updatedAt": "2025-01-25T00:00:00Z" +} diff --git a/samples/calculator/evaluators/json_similarity.json b/samples/calculator/evaluators/json_similarity.json new file mode 100644 index 000000000..b1fac450e --- /dev/null +++ b/samples/calculator/evaluators/json_similarity.json @@ -0,0 +1,10 @@ +{ + "id": "json-similarity-eval", + "name": "JSON Similarity Evaluator", + "description": "Tests for structural JSON similarity with tolerance", + "category": 0, + "type": 6, + "targetOutputKey": "*", + "createdAt": "2025-01-25T00:00:00Z", + "updatedAt": "2025-01-25T00:00:00Z" +} diff --git a/src/uipath/_cli/_interactive/__init__.py b/src/uipath/_cli/_interactive/__init__.py new file mode 100644 index 000000000..3fe5a81ab --- /dev/null +++ b/src/uipath/_cli/_interactive/__init__.py @@ -0,0 +1,5 @@ +"""Interactive evaluation CLI module.""" + +from ._main import launch_interactive_cli + +__all__ = ["launch_interactive_cli"] diff --git a/src/uipath/_cli/_interactive/_discovery.py b/src/uipath/_cli/_interactive/_discovery.py new file mode 100644 index 000000000..08ea55d84 --- /dev/null +++ b/src/uipath/_cli/_interactive/_discovery.py @@ -0,0 +1,48 @@ +"""Discovery utilities for finding eval sets and evaluators.""" +# type: ignore + +import json +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from ._main import InteractiveEvalCLI + + +class DiscoveryMixin: + """Mixin for file discovery operations.""" + + def _discover_files(self: "InteractiveEvalCLI") -> None: + """Quickly discover eval sets and evaluators.""" + # Clear existing lists to avoid duplicates + self.eval_sets.clear() + self.evaluators.clear() + + # Find eval sets from evaluationSets folder + eval_sets_dir = self.project_root / "evaluationSets" + if eval_sets_dir.exists(): + for eval_file in eval_sets_dir.glob("*.json"): + try: + with open(eval_file) as f: + data = json.load(f) + # Check if it's an eval set by presence of "evaluations" array + if "evaluations" in data and isinstance( + data.get("evaluations"), list + ): + name = data.get("name", eval_file.stem) + self.eval_sets.append((name, eval_file)) + except Exception: + pass + + # Find evaluators from evaluators folder + evaluators_dir = self.project_root / "evaluators" + if evaluators_dir.exists(): + for eval_file in evaluators_dir.glob("*.json"): + try: + with open(eval_file) as f: + data = json.load(f) + # Verify it has evaluator-specific fields + if "id" in data and "type" in data: + name = data.get("name", eval_file.stem) + self.evaluators.append((name, eval_file)) + except Exception: + pass diff --git a/src/uipath/_cli/_interactive/_drill_down.py b/src/uipath/_cli/_interactive/_drill_down.py new file mode 100644 index 000000000..a200054b0 --- /dev/null +++ b/src/uipath/_cli/_interactive/_drill_down.py @@ -0,0 +1,92 @@ +"""Drill-down navigation for eval sets and evaluators.""" +# type: ignore + +from typing import TYPE_CHECKING + +from .._utils._console import ConsoleLogger + +if TYPE_CHECKING: + from ._main import InteractiveEvalCLI + +console = ConsoleLogger() + + +class DrillDownMixin: + """Mixin for drill-down navigation operations.""" + + def _drill_down_eval_sets(self: 
"InteractiveEvalCLI") -> None: + """Drill down into eval sets with navigation.""" + if not self.eval_sets: + self._show_no_items_screen("eval sets") + return + + current_selection = 0 + while True: + self._clear_screen() + console.info("šŸ“‹ Eval Sets - Navigate & Select") + console.info( + "āŒØļø Navigation: ↑↓ to navigate, Enter for details, q/Backspace to go back" + ) + console.info("─" * 65) + + for i, (name, path) in enumerate(self.eval_sets): + if i == current_selection: + console.info(f"ā–ŗ {i + 1}. {name} ā—„") + self._show_eval_set_preview(path) + else: + console.info(f" {i + 1}. {name}") + + key = self._get_key_input() + + if key in ["q", "Q", "back"]: + break + elif key == "up": + current_selection = (current_selection - 1) % len(self.eval_sets) + elif key == "down": + current_selection = (current_selection + 1) % len(self.eval_sets) + elif key in ["enter", " "]: + self._show_eval_set_details(self.eval_sets[current_selection]) + elif key.isdigit() and 1 <= int(key) <= len(self.eval_sets): + current_selection = int(key) - 1 + + def _drill_down_evaluators(self: "InteractiveEvalCLI") -> None: + """Drill down into evaluators with navigation.""" + if not self.evaluators: + self._show_no_items_screen("evaluators") + return + + current_selection = 0 + while True: + self._clear_screen() + console.info("āš™ļø Evaluators - Navigate & Select") + console.info( + "āŒØļø Navigation: ↑↓ to navigate, Enter for details, q/Backspace to go back" + ) + console.info("─" * 65) + + for i, (name, path) in enumerate(self.evaluators): + if i == current_selection: + console.info(f"ā–ŗ {i + 1}. {name} ā—„") + self._show_evaluator_preview(path) + else: + console.info(f" {i + 1}. {name}") + + key = self._get_key_input() + + if key in ["q", "Q", "back"]: + break + elif key == "up": + current_selection = (current_selection - 1) % len(self.evaluators) + elif key == "down": + current_selection = (current_selection + 1) % len(self.evaluators) + elif key in ["enter", " "]: + self._show_evaluator_details(self.evaluators[current_selection]) + elif key.isdigit() and 1 <= int(key) <= len(self.evaluators): + current_selection = int(key) - 1 + + def _show_no_items_screen(self: "InteractiveEvalCLI", item_type: str) -> None: + """Show no items screen.""" + self._clear_screen() + console.warning(f"No {item_type} found!") + console.info("Press Enter to go back...") + self._get_input("") diff --git a/src/uipath/_cli/_interactive/_eval_sets.py b/src/uipath/_cli/_interactive/_eval_sets.py new file mode 100644 index 000000000..2ac1da8df --- /dev/null +++ b/src/uipath/_cli/_interactive/_eval_sets.py @@ -0,0 +1,347 @@ +"""Eval set operations for interactive CLI.""" +# type: ignore + +import json +from datetime import datetime, timezone +from pathlib import Path +from typing import TYPE_CHECKING, Any, Dict, List + +from .._utils._console import ConsoleLogger + +if TYPE_CHECKING: + from ._main import InteractiveEvalCLI + +console = ConsoleLogger() + + +class EvalSetMixin: + """Mixin for eval set operations.""" + + def _create_eval_set_simple(self: "InteractiveEvalCLI") -> None: + """Create new evaluation set - simplified version.""" + self._clear_screen() + console.info("āž• Create New Eval Set") + console.info("─" * 65) + + name = self._get_input("Name: ") + if not name: + return + + # Create clean filename from name + filename = f"{name.lower().replace(' ', '_')}.json" + + # Create basic eval set + eval_set = { + "id": f"eval-{len(self.eval_sets) + 1}", + "fileName": filename, + "evaluatorRefs": [], + "name": name, + 
"batchSize": 10, + "timeoutMinutes": 20, + "modelSettings": [], + "createdAt": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"), + "updatedAt": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"), + "evaluations": [], + } + + # Ask if they want to add evaluations + add_evals = self._get_input("Add evaluations now? (y/n): ").lower() + if add_evals in ["y", "yes"]: + eval_set["evaluations"] = self._add_evaluations_interactive( + str(eval_set["id"]) + ) + + # Ensure evaluationSets directory exists + eval_sets_dir = self.project_root / "evaluationSets" + eval_sets_dir.mkdir(exist_ok=True) + + # Save file + file_path = eval_sets_dir / filename + + with open(file_path, "w") as f: + json.dump(eval_set, f, indent=2) + + console.success(f"āœ… Created eval set: {filename}") + self._discover_files() # Refresh + + def _create_eval_set_interactive(self: "InteractiveEvalCLI") -> None: + """Create new evaluation set with comprehensive questions.""" + self._clear_screen() + console.info("āž• Create New Eval Set - Interactive Wizard") + console.info("─" * 65) + + # Basic Information + console.info("šŸ“ Basic Information") + name = input("āž¤ Eval Set Name: ").strip() + if not name: + console.warning("Name is required!") + input("Press Enter to continue...") + return + + # Create clean filename from name + filename = f"{name.lower().replace(' ', '_')}.json" + + # Evaluator References + console.info("\nšŸŽÆ Evaluator References") + console.info("Available evaluators:") + for i, (eval_name, _) in enumerate(self.evaluators, 1): + console.info(f" {i}. {eval_name}") + + evaluator_refs = [] + if self.evaluators: + refs_input = input( + "āž¤ Select evaluators (comma-separated numbers, or 'all'): " + ).strip() + if refs_input.lower() == "all": + evaluator_refs = [ + self._get_evaluator_id(path) for eval_name, path in self.evaluators + ] + elif refs_input: + try: + for num in refs_input.split(","): + idx = int(num.strip()) - 1 + if 0 <= idx < len(self.evaluators): + eval_path = self.evaluators[idx][1] + eval_id = self._get_evaluator_id(eval_path) + evaluator_refs.append(eval_id) + except ValueError: + console.warning("Invalid input, no evaluators selected") + + # Test Cases + console.info("\nšŸ“ Test Cases") + evaluations = [] + test_count = 1 + + while True: + console.info(f"\nTest Case #{test_count}") + test_name = input("āž¤ Test Name (or 'done' to finish): ").strip() + if test_name.lower() == "done": + break + + if not test_name: + console.warning("Test name is required!") + continue + + # Inputs + console.info("šŸ“„ Inputs (JSON format)") + console.info('Examples: {"a": 5, "b": 3} or {"query": "hello world"}') + inputs_str = input("āž¤ Inputs: ").strip() + try: + inputs = json.loads(inputs_str) if inputs_str else {} + except json.JSONDecodeError: + console.warning("Invalid JSON, using empty inputs") + inputs = {} + + # Expected Output + console.info("šŸ“¤ Expected Output (JSON format)") + expected_str = input("āž¤ Expected Output: ").strip() + try: + expected_output = json.loads(expected_str) if expected_str else {} + except json.JSONDecodeError: + console.warning("Invalid JSON, using empty expected output") + expected_output = {} + + evaluation: Dict[str, Any] = { + "id": f"test-{test_count}", + "name": test_name, + "inputs": inputs, + "expectedOutput": expected_output, + "expectedAgentBehavior": "", + "simulationInstructions": "", + "simulateInput": False, + "inputGenerationInstructions": "", + "simulateTools": False, + "toolsToSimulate": [], + "evalSetId": f"eval-{len(self.eval_sets) 
+ 1}", + "createdAt": datetime.now(timezone.utc) + .isoformat() + .replace("+00:00", "Z"), + "updatedAt": datetime.now(timezone.utc) + .isoformat() + .replace("+00:00", "Z"), + } + evaluations.append(evaluation) + test_count += 1 + + if not evaluations: + console.warning("At least one test case is required!") + input("Press Enter to continue...") + return + + # Create eval set + eval_set = { + "id": f"eval-{len(self.eval_sets) + 1}", + "fileName": filename, + "evaluatorRefs": evaluator_refs, + "name": name, + "batchSize": 10, + "timeoutMinutes": 20, + "modelSettings": [], + "createdAt": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"), + "updatedAt": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"), + "evaluations": evaluations, + } + + # Ensure evaluationSets directory exists + eval_sets_dir = self.project_root / "evaluationSets" + eval_sets_dir.mkdir(exist_ok=True) + + # Save file + file_path = eval_sets_dir / filename + + try: + with open(file_path, "w") as f: + json.dump(eval_set, f, indent=2) + + console.success(f"\nāœ… Created eval set: {filename}") + console.info(f"šŸ“Š Tests: {len(evaluations)}") + console.info(f"āš™ļø Evaluators: {len(evaluator_refs)}") + + self._discover_files() # Refresh + except Exception as e: + console.error(f"Failed to create eval set: {e}") + + input("\nPress Enter to continue...") + + def _add_evaluations_interactive( + self: "InteractiveEvalCLI", eval_set_id: str + ) -> List[Dict[str, Any]]: + """Add evaluations interactively.""" + evaluations = [] + test_count = 1 + + while True: + console.info(f"\nTest Case #{test_count}") + test_name = self._get_input("Test Name (or 'done' to finish): ") + if test_name.lower() == "done": + break + + if not test_name: + console.warning("Test name is required!") + continue + + # Inputs + console.info("šŸ“„ Inputs (JSON format)") + console.info('Examples: {"a": 5, "b": 3} or {"query": "hello world"}') + inputs_str = input("āž¤ Inputs: ").strip() + try: + inputs = json.loads(inputs_str) if inputs_str else {} + except json.JSONDecodeError: + console.warning("Invalid JSON, using empty inputs") + inputs = {} + + # Expected Output + console.info("šŸ“¤ Expected Output (JSON format)") + expected_str = input("āž¤ Expected Output: ").strip() + try: + expected_output = json.loads(expected_str) if expected_str else {} + except json.JSONDecodeError: + console.warning("Invalid JSON, using empty expected output") + expected_output = {} + + evaluation: Dict[str, Any] = { + "id": f"test-{test_count}", + "name": test_name, + "inputs": inputs, + "expectedOutput": expected_output, + "expectedAgentBehavior": "", + "simulationInstructions": "", + "simulateInput": False, + "inputGenerationInstructions": "", + "simulateTools": False, + "toolsToSimulate": [], + "evalSetId": eval_set_id, + "createdAt": datetime.now(timezone.utc) + .isoformat() + .replace("+00:00", "Z"), + "updatedAt": datetime.now(timezone.utc) + .isoformat() + .replace("+00:00", "Z"), + } + evaluations.append(evaluation) + test_count += 1 + + return evaluations + + def _list_eval_sets(self: "InteractiveEvalCLI") -> None: + """List available eval sets.""" + console.info("\nšŸ“‹ Available Eval Sets:") + if not self.eval_sets: + console.warning("No eval sets found") + return + + for i, (name, path) in enumerate(self.eval_sets, 1): + try: + with open(path) as f: + data = json.load(f) + test_count = len(data.get("evaluations", [])) + evaluator_count = len(data.get("evaluatorRefs", [])) + console.info(f"{i}. 
{name}") + console.info(f" Tests: {test_count} | Evaluators: {evaluator_count}") + console.info(f" File: {path.name}") + except Exception: + console.info(f"{i}. {name} (error loading)") + + def _show_eval_set_preview(self: "InteractiveEvalCLI", path: Path) -> None: + """Show eval set preview info.""" + try: + with open(path) as f: + data = json.load(f) + test_count = len(data.get("evaluations", [])) + evaluator_count = len(data.get("evaluatorRefs", [])) + console.info(f" šŸ“„ {path.name}") + console.info(f" šŸ“Š Tests: {test_count} | Evaluators: {evaluator_count}") + except Exception: + console.info(f" šŸ“„ {path.name} (error loading)") + + def _show_eval_set_details( + self: "InteractiveEvalCLI", eval_set_tuple: tuple[str, Path] + ) -> None: + """Show detailed eval set view.""" + name, path = eval_set_tuple + self._clear_screen() + console.info(f"šŸ“‹ Eval Set Details: {name}") + console.info("─" * 65) + + try: + with open(path) as f: + data = json.load(f) + + console.info(f"\nšŸ“„ {path.name}") + console.info(f"šŸ†” ID: {data.get('id', 'Unknown')}") + console.info(f"šŸ“Š Tests: {len(data.get('evaluations', []))}") + console.info(f"āš™ļø Evaluators: {len(data.get('evaluatorRefs', []))}") + console.info(f"šŸ“¦ Batch Size: {data.get('batchSize', 'Unknown')}") + console.info(f"ā±ļø Timeout: {data.get('timeoutMinutes', 'Unknown')} minutes") + + evaluator_refs = data.get("evaluatorRefs", []) + if evaluator_refs: + console.info("\nšŸŽÆ Evaluator References:") + for ref in evaluator_refs: + console.info(f" • {ref}") + + evaluations = data.get("evaluations", []) + if evaluations: + console.info("\nšŸ“ Test Cases:") + for i, eval_data in enumerate(evaluations[:10], 1): # Show first 10 + test_name = eval_data.get("name", f"Test {i}") + console.info(f" {i}. {test_name}") + if "inputs" in eval_data: + inputs_preview = str(eval_data["inputs"])[:60] + if len(str(eval_data["inputs"])) > 60: + inputs_preview += "..." + console.info(f" Input: {inputs_preview}") + if "expectedOutput" in eval_data: + output_preview = str(eval_data["expectedOutput"])[:60] + if len(str(eval_data["expectedOutput"])) > 60: + output_preview += "..." + console.info(f" Expected: {output_preview}") + + if len(evaluations) > 10: + console.info(f"\n ... 
and {len(evaluations) - 10} more tests") + + except Exception as e: + console.error(f"Error loading eval set: {e}") + + console.info("\nšŸ’” Press Backspace to go back") + self._get_key_input() diff --git a/src/uipath/_cli/_interactive/_evaluators.py b/src/uipath/_cli/_interactive/_evaluators.py new file mode 100644 index 000000000..541a5bbf1 --- /dev/null +++ b/src/uipath/_cli/_interactive/_evaluators.py @@ -0,0 +1,293 @@ +"""Evaluator operations for interactive CLI.""" +# type: ignore + +import json +from datetime import datetime, timezone +from pathlib import Path +from typing import TYPE_CHECKING + +from .._utils._console import ConsoleLogger + +if TYPE_CHECKING: + from ._main import InteractiveEvalCLI + +console = ConsoleLogger() + + +class EvaluatorMixin: + """Mixin for evaluator operations.""" + + def _create_evaluator_simple(self: "InteractiveEvalCLI") -> None: + """Create new evaluator - simplified version.""" + self._clear_screen() + console.info("āž• Create New Evaluator") + console.info("─" * 65) + + name = self._get_input("Name: ") + if not name: + return + + # Create basic evaluator + evaluator = { + "id": f"eval-{name.lower().replace(' ', '-')}", + "name": name, + "description": f"{name} evaluator", + "category": 0, + "type": 1, + "targetOutputKey": "*", + "createdAt": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"), + "updatedAt": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"), + } + + # Ensure evaluators directory exists + evaluators_dir = self.project_root / "evaluators" + evaluators_dir.mkdir(exist_ok=True) + + # Save file + filename = f"{name.lower().replace(' ', '_')}.json" + file_path = evaluators_dir / filename + + with open(file_path, "w") as f: + json.dump(evaluator, f, indent=2) + + console.success(f"āœ… Created evaluator: {filename}") + self._discover_files() # Refresh + + def _create_evaluator_interactive(self: "InteractiveEvalCLI") -> None: + """Create new evaluator with comprehensive questions.""" + self._clear_screen() + console.info("āž• Create New Evaluator - Interactive Wizard") + console.info("─" * 65) + + # Basic Information + console.info("šŸ“ Basic Information") + name = input("āž¤ Evaluator Name: ").strip() + if not name: + console.warning("Name is required!") + input("Press Enter to continue...") + return + + description = input("āž¤ Description: ").strip() or f"{name} evaluator" + + # Category Selection + console.info("\nšŸ·ļø Category Selection") + categories = { + 0: "Deterministic", + 1: "LLM as Judge", + 2: "Agent Scorer", + 3: "Trajectory", + } + + for key, value in categories.items(): + console.info(f" {key}. {value}") + + try: + category = int(input("āž¤ Select Category (0-3): ") or "0") + if category not in categories: + category = 0 + except ValueError: + category = 0 + + # Type Selection + console.info(f"\nšŸŽÆ Type Selection (Category: {categories[category]})") + types = { + 0: "Unknown", + 1: "Exact Match", + 2: "Contains", + 3: "Regex", + 4: "Factuality", + 5: "Custom", + 6: "JSON Similarity", + 7: "Trajectory", + } + + # Show relevant types based on category + relevant_types = [] + if category == 0: # Deterministic + relevant_types = [ + 1, + 2, + 3, + 6, + ] # Exact Match, Contains, Regex, JSON Similarity + elif category == 1: # LLM as Judge + relevant_types = [4, 5] # Factuality, Custom + elif category == 3: # Trajectory + relevant_types = [7] # Trajectory + else: + relevant_types = list(types.keys()) + + for type_id in relevant_types: + console.info(f" {type_id}. 
{types[type_id]}") + + try: + eval_type = int( + input(f"āž¤ Select Type ({', '.join(map(str, relevant_types))}): ") + or str(relevant_types[0]) + ) + if eval_type not in relevant_types: + eval_type = relevant_types[0] + except (ValueError, IndexError): + eval_type = 1 + + # Target Output Key + console.info("\nšŸ” Target Configuration") + console.info( + "Target Output Key determines which part of the output to evaluate" + ) + console.info("Examples: '*' (all), 'result', 'answer', 'output'") + target_key = input("āž¤ Target Output Key (default: '*'): ").strip() or "*" + + # Create basic evaluator + evaluator = { + "id": f"eval-{name.lower().replace(' ', '-')}", + "name": name, + "description": description, + "category": category, + "type": eval_type, + "targetOutputKey": target_key, + "createdAt": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"), + "updatedAt": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"), + } + + # LLM Configuration (if LLM as Judge) + if category == 1: # LLM as Judge + console.info("\nšŸ¤– LLM Configuration") + model_name = input("āž¤ Model Name (default: gpt-4): ").strip() or "gpt-4" + + console.info("šŸ“ Evaluation Prompt") + console.info("This prompt will be used to evaluate the agent's output") + prompt = input("āž¤ Evaluation Prompt: ").strip() + + if prompt: + evaluator["llmConfig"] = { + "modelName": model_name, + "prompt": prompt, + "temperature": 0.0, + "maxTokens": 1000, + } + + # Ensure evaluators directory exists + evaluators_dir = self.project_root / "evaluators" + evaluators_dir.mkdir(exist_ok=True) + + # Save file + filename = f"{name.lower().replace(' ', '_')}.json" + file_path = evaluators_dir / filename + + try: + with open(file_path, "w") as f: + json.dump(evaluator, f, indent=2) + + console.success(f"\nāœ… Created evaluator: {filename}") + console.info(f"šŸ·ļø Category: {categories[category]}") + console.info(f"šŸŽÆ Type: {types[eval_type]}") + console.info(f"šŸ” Target: {target_key}") + + self._discover_files() # Refresh + except Exception as e: + console.error(f"Failed to create evaluator: {e}") + + input("\nPress Enter to continue...") + + def _list_evaluators(self: "InteractiveEvalCLI") -> None: + """List available evaluators.""" + console.info("\nāš™ļø Available Evaluators:") + if not self.evaluators: + console.warning("No evaluators found") + return + + for i, (name, path) in enumerate(self.evaluators, 1): + try: + with open(path) as f: + data = json.load(f) + category = self._get_category_name(data.get("category", 0)) + type_name = self._get_type_name(data.get("type", 1)) + console.info(f"{i}. {name}") + console.info(f" Type: {category} | {type_name}") + console.info(f" File: {path.name}") + except Exception: + console.info(f"{i}. 
{name} (error loading)") + + def _show_evaluator_preview(self: "InteractiveEvalCLI", path: Path) -> None: + """Show evaluator preview info.""" + try: + with open(path) as f: + data = json.load(f) + category = self._get_category_name(data.get("category", 0)) + type_name = self._get_type_name(data.get("type", 1)) + console.info(f" šŸ“„ {path.name}") + console.info(f" šŸŽÆ Type: {category} | {type_name}") + except Exception: + console.info(f" šŸ“„ {path.name} (error loading)") + + def _show_evaluator_details( + self: "InteractiveEvalCLI", evaluator_tuple: tuple[str, Path] + ) -> None: + """Show detailed evaluator view.""" + name, path = evaluator_tuple + self._clear_screen() + console.info(f"āš™ļø Evaluator Details: {name}") + console.info("─" * 65) + + try: + with open(path) as f: + data = json.load(f) + + console.info(f"\nšŸ“„ {path.name}") + console.info(f"šŸ†” ID: {data.get('id', 'Unknown')}") + console.info(f"šŸ“ Description: {data.get('description', 'No description')}") + console.info( + f"šŸ·ļø Category: {self._get_category_name(data.get('category', 0))}" + ) + console.info(f"šŸŽÆ Type: {self._get_type_name(data.get('type', 1))}") + console.info(f"šŸ” Target Key: {data.get('targetOutputKey', '*')}") + + if "llmConfig" in data: + llm_config = data["llmConfig"] + console.info("\nšŸ¤– LLM Configuration:") + console.info(f" Model: {llm_config.get('modelName', 'Unknown')}") + if "prompt" in llm_config: + prompt_preview = llm_config["prompt"][:100] + if len(llm_config["prompt"]) > 100: + prompt_preview += "..." + console.info(f" Prompt: {prompt_preview}") + + except Exception as e: + console.error(f"Error loading evaluator: {e}") + + console.info("\nšŸ’” Press Backspace to go back") + self._get_key_input() + + def _get_category_name(self: "InteractiveEvalCLI", category: int) -> str: + """Get category name from number.""" + categories = { + 0: "Deterministic", + 1: "LLM as Judge", + 2: "Agent Scorer", + 3: "Trajectory", + } + return categories.get(category, "Unknown") + + def _get_type_name(self: "InteractiveEvalCLI", eval_type: int) -> str: + """Get type name from number.""" + types = { + 0: "Unknown", + 1: "Exact Match", + 2: "Contains", + 3: "Regex", + 4: "Factuality", + 5: "Custom", + 6: "JSON Similarity", + 7: "Trajectory", + } + return types.get(eval_type, "Unknown") + + def _get_evaluator_id(self: "InteractiveEvalCLI", path: Path) -> str: + """Get evaluator ID from file.""" + try: + with open(path) as f: + data = json.load(f) + return data.get("id", path.stem) + except Exception: + return path.stem diff --git a/src/uipath/_cli/_interactive/_execution.py b/src/uipath/_cli/_interactive/_execution.py new file mode 100644 index 000000000..8152fb3a5 --- /dev/null +++ b/src/uipath/_cli/_interactive/_execution.py @@ -0,0 +1,153 @@ +"""Execution utilities for running evaluations.""" +# type: ignore + +import subprocess +import sys +from pathlib import Path +from typing import TYPE_CHECKING, Optional + +from .._utils._console import ConsoleLogger + +if TYPE_CHECKING: + from ._main import InteractiveEvalCLI + +console = ConsoleLogger() + + +class ExecutionMixin: + """Mixin for execution operations.""" + + def _execute_evaluation(self: "InteractiveEvalCLI", eval_path: Path) -> None: + """Execute evaluation with live results.""" + console.info("\nšŸš€ Running evaluation...") + + # Find main.py + main_py = self._find_main_py() + if not main_py: + console.error("Could not find main.py") + return + + # Build command - run from the project directory + cmd = [ + sys.executable, + "-m", + 
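+            # Runs the eval command as a child process so its output can be streamed live;
+            # the entrypoint and eval-set paths are passed relative to project_root, which is also used as cwd.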
"uipath._cli.cli_eval", + str(main_py.relative_to(self.project_root)), + str(eval_path.relative_to(self.project_root)), + "--no-report", + "--workers", + "1", + ] + + console.info( + f"šŸ’» Command: uipath eval {main_py.name} {eval_path.name} --no-report" + ) + + try: + # Run with real-time output from project directory + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + universal_newlines=True, + cwd=self.project_root, + ) + + # Stream output in real-time + if process.stdout: + for line in process.stdout: + print(line.rstrip()) + + process.wait() + + if process.returncode == 0: + console.success("\nāœ… Evaluation completed successfully!") + else: + console.error( + f"\nāŒ Evaluation failed (exit code: {process.returncode})" + ) + + except Exception as e: + console.error(f"Failed to run evaluation: {e}") + + def _execute_evaluation_no_clear( + self: "InteractiveEvalCLI", eval_path: Path + ) -> None: + """Execute evaluation without clearing screen.""" + console.info("\nšŸš€ Running evaluation...") + + # Find main.py + main_py = self._find_main_py() + if not main_py: + console.error("Could not find main.py") + input("\nPress Enter to continue...") + return + + # Build command - run from the project directory + cmd = [ + sys.executable, + "-m", + "uipath._cli.cli_eval", + str(main_py.relative_to(self.project_root)), + str(eval_path.relative_to(self.project_root)), + "--no-report", + "--workers", + "1", + ] + + console.info( + f"šŸ’» Command: uipath eval {main_py.name} {eval_path.name} --no-report" + ) + + try: + # Run with real-time output from project directory + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + universal_newlines=True, + cwd=self.project_root, + ) + + # Stream output in real-time + if process.stdout: + for line in process.stdout: + print(line.rstrip()) + + process.wait() + + if process.returncode == 0: + console.success("\nāœ… Evaluation completed successfully!") + else: + console.error( + f"\nāŒ Evaluation failed (exit code: {process.returncode})" + ) + + except Exception as e: + console.error(f"Failed to run evaluation: {e}") + + input("\nPress Enter to continue...") + + def _find_main_py(self: "InteractiveEvalCLI") -> Optional[Path]: + """Find main.py file.""" + # Check current directory + main_py = self.project_root / "main.py" + if main_py.exists(): + return main_py + + # Check parent directories + for parent in self.project_root.parents: + main_py = parent / "main.py" + if main_py.exists(): + return main_py + + return None + + def _confirm(self: "InteractiveEvalCLI", prompt: str) -> bool: + """Ask for confirmation.""" + response = self._get_input(f"{prompt} (y/n): ").lower() + return response in ["y", "yes"] diff --git a/src/uipath/_cli/_interactive/_main.py b/src/uipath/_cli/_interactive/_main.py new file mode 100644 index 000000000..c41f4023a --- /dev/null +++ b/src/uipath/_cli/_interactive/_main.py @@ -0,0 +1,199 @@ +"""Main interactive CLI for evaluations.""" + +from pathlib import Path +from typing import List, Optional, Tuple + +from .._utils._console import ConsoleLogger +from ._discovery import DiscoveryMixin +from ._drill_down import DrillDownMixin +from ._eval_sets import EvalSetMixin +from ._evaluators import EvaluatorMixin +from ._execution import ExecutionMixin +from ._navigation import HAS_NAVIGATION, NavigationMixin + +console = ConsoleLogger() + + +class InteractiveEvalCLI( + NavigationMixin, + DiscoveryMixin, + 
EvalSetMixin, + EvaluatorMixin, + ExecutionMixin, + DrillDownMixin, +): + """Simple, fast, keyboard-driven evaluation CLI.""" + + def __init__(self, project_root: Optional[Path] = None): + self.project_root = project_root or Path.cwd() + self.eval_sets: List[Tuple[str, Path]] = [] + self.evaluators: List[Tuple[str, Path]] = [] + self.current_selection = 0 + self.menu_items = [ + "šŸ“‹ List eval sets", + "āš™ļø List evaluators", + "⚔ Quick run (auto-select)", + "āž• Create eval set", + "āž• Create evaluator", + "šŸŽÆ Run specific combination", + ] + self._discover_files() + + def run(self) -> None: + """Run the interactive CLI.""" + self._show_ascii_art() + + if not HAS_NAVIGATION: + console.warning( + "āš ļø Terminal navigation not available. Using fallback mode." + ) + console.info("Consider using a standard terminal for better experience.\n") + self._run_fallback_mode() + return + + try: + self._run_navigation_mode() + except KeyboardInterrupt: + console.info("\nšŸ‘‹ Goodbye!") + + def _run_navigation_mode(self) -> None: + """Run with arrow key navigation.""" + while True: + self._clear_screen() + self._show_ascii_art() + self._show_menu(self.current_selection, self.menu_items) + + key = self._get_key_input() + + if key == "up": + self.current_selection = (self.current_selection - 1) % len( + self.menu_items + ) + elif key == "down": + self.current_selection = (self.current_selection + 1) % len( + self.menu_items + ) + elif key in ["enter", " "]: + self._execute_menu_item_with_navigation(self.current_selection) + elif key.isdigit() and 1 <= int(key) <= 6: + self._execute_menu_item_with_navigation(int(key) - 1) + + def _execute_menu_item_with_navigation(self, index: int) -> None: + """Execute menu item with navigation support.""" + if index == 0: + self._drill_down_eval_sets() + elif index == 1: + self._drill_down_evaluators() + elif index == 2: + self._quick_run_with_navigation() + elif index == 3: + self._create_eval_set_interactive() + elif index == 4: + self._create_evaluator_interactive() + elif index == 5: + self._run_specific_combination() + + def _run_fallback_mode(self) -> None: + """Run without navigation - simple text interface.""" + while True: + console.info("\nāš™ļø Main Menu:") + for i, item in enumerate(self.menu_items, 1): + console.info(f" {i}. {item}") + console.info(" 0. 
Exit") + + try: + choice = input("\nāž¤ Select option: ").strip() + + if choice == "0": + console.info("šŸ‘‹ Goodbye!") + break + elif choice == "1": + self._list_eval_sets_navigation() + elif choice == "2": + self._list_evaluators() + elif choice == "3": + self._quick_run() + elif choice == "4": + self._create_eval_set_simple() + elif choice == "5": + self._create_evaluator_simple() + elif choice == "6": + self._run_specific_combination() + else: + console.warning("Invalid option") + except KeyboardInterrupt: + console.info("\nšŸ‘‹ Goodbye!") + break + + def _quick_run_with_navigation(self) -> None: + """Quick run evaluation with auto-selected eval set.""" + if not self.eval_sets: + self._clear_screen() + console.warning("No eval sets found!") + console.info("Press Enter to go back...") + self._get_input("") + return + + # Use first eval set + eval_name, eval_path = self.eval_sets[0] + + self._clear_screen() + console.info(f"⚔ Quick Run: {eval_name}") + console.info("─" * 65) + + if self._confirm("Run evaluation now?"): + self._execute_evaluation_no_clear(eval_path) + + def _quick_run(self) -> None: + """Quick run evaluation with auto-selected eval set.""" + if not self.eval_sets: + console.warning("No eval sets found!") + return + + # Use first eval set + eval_name, eval_path = self.eval_sets[0] + console.info(f"\n⚔ Quick Run: {eval_name}") + + if self._confirm("Run evaluation now?"): + self._execute_evaluation(eval_path) + + def _list_eval_sets_navigation(self) -> None: + """List eval sets with navigation.""" + self._clear_screen() + console.info("šŸ“‹ Available Eval Sets") + console.info("─" * 65) + self._list_eval_sets() + input("\nPress Enter to continue...") + + def _run_specific_combination(self) -> None: + """Run specific eval set and evaluator combination.""" + self._clear_screen() + console.info("šŸŽÆ Run Specific Combination") + console.info("─" * 65) + + # Select eval set + console.info("\nšŸ“‹ Select Eval Set:") + for i, (name, _) in enumerate(self.eval_sets, 1): + console.info(f" {i}. 
{name}") + + try: + eval_idx = int(input("\nāž¤ Eval Set Number: ").strip()) - 1 + if not (0 <= eval_idx < len(self.eval_sets)): + console.error("Invalid selection") + input("\nPress Enter to continue...") + return + + eval_name, eval_path = self.eval_sets[eval_idx] + + console.info(f"\nāœ… Selected: {eval_name}") + if self._confirm("Run evaluation now?"): + self._execute_evaluation_no_clear(eval_path) + except ValueError: + console.error("Invalid selection") + input("\nPress Enter to continue...") + + +def launch_interactive_cli(project_root: Optional[Path] = None) -> None: + """Launch the interactive CLI.""" + cli = InteractiveEvalCLI(project_root) + cli.run() diff --git a/src/uipath/_cli/_interactive/_navigation.py b/src/uipath/_cli/_interactive/_navigation.py new file mode 100644 index 000000000..4f8077ca0 --- /dev/null +++ b/src/uipath/_cli/_interactive/_navigation.py @@ -0,0 +1,109 @@ +"""Navigation and input handling for interactive CLI.""" + +import sys +import termios +import tty + +from .._utils._console import ConsoleLogger + +console = ConsoleLogger() + + +def has_termios() -> bool: + """Check if we have termios support for advanced input.""" + try: + termios.tcgetattr(sys.stdin) + return True + except Exception: + return False + + +HAS_NAVIGATION = has_termios() + + +class NavigationMixin: + """Mixin for navigation and input handling.""" + + def _clear_screen(self) -> None: + """Clear the screen.""" + print("\033[2J\033[H", end="") + + def _get_input(self, prompt: str) -> str: + """Get input from user.""" + return input(prompt).strip() + + def _get_key_input(self) -> str: + """Get key input with arrow key support.""" + if not HAS_NAVIGATION: + return input("āž¤ ").strip().lower() + + old_settings = termios.tcgetattr(sys.stdin) + try: + tty.setraw(sys.stdin) + + # Read first character + char = sys.stdin.read(1) + + # Check for escape sequences (arrow keys) + if char == "\x1b": # ESC + next_char = sys.stdin.read(1) + if next_char == "[": + arrow = sys.stdin.read(1) + if arrow == "A": + return "up" + elif arrow == "B": + return "down" + return "" + + # Backspace handling + if char == "\x7f": # Backspace (DEL) + return "back" + elif char == "\x08": # Backspace (BS) + return "back" + + # Enter key + if char in ["\r", "\n"]: + return "enter" + + # Digit keys + elif char.isdigit() and 1 <= int(char) <= 6: + return char + elif char == "\x03": # Ctrl+C + raise KeyboardInterrupt + + return "" + except Exception: + return input("āž¤ ").strip().lower() + finally: + # Restore terminal settings + try: + termios.tcsetattr(sys.stdin, termios.TCSADRAIN, old_settings) + except Exception: + pass + + def _show_ascii_art(self) -> None: + """Display ASCII art banner.""" + art = """ + ā–ˆā–ˆā•— ā–ˆā–ˆā•—ā–ˆā–ˆā•—ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā•— ā–ˆā–ˆā–ˆā–ˆā–ˆā•— ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā•—ā–ˆā–ˆā•— ā–ˆā–ˆā•— + ā–ˆā–ˆā•‘ ā–ˆā–ˆā•‘ā–ˆā–ˆā•‘ā–ˆā–ˆā•”ā•ā•ā–ˆā–ˆā•—ā–ˆā–ˆā•”ā•ā•ā–ˆā–ˆā•—ā•šā•ā•ā–ˆā–ˆā•”ā•ā•ā•ā–ˆā–ˆā•‘ ā–ˆā–ˆā•‘ + ā–ˆā–ˆā•‘ ā–ˆā–ˆā•‘ā–ˆā–ˆā•‘ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā•”ā•ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā•‘ ā–ˆā–ˆā•‘ ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā•‘ + ā–ˆā–ˆā•‘ ā–ˆā–ˆā•‘ā–ˆā–ˆā•‘ā–ˆā–ˆā•”ā•ā•ā•ā• ā–ˆā–ˆā•”ā•ā•ā–ˆā–ˆā•‘ ā–ˆā–ˆā•‘ ā–ˆā–ˆā•”ā•ā•ā–ˆā–ˆā•‘ + ā•šā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā•”ā•ā–ˆā–ˆā•‘ā–ˆā–ˆā•‘ ā–ˆā–ˆā•‘ ā–ˆā–ˆā•‘ ā–ˆā–ˆā•‘ ā–ˆā–ˆā•‘ ā–ˆā–ˆā•‘ + ā•šā•ā•ā•ā•ā•ā• ā•šā•ā•ā•šā•ā• ā•šā•ā• ā•šā•ā• ā•šā•ā• ā•šā•ā• ā•šā•ā• + + Evaluation Builder + Interactive Evaluation Toolkit + """ + console.info(art) + + def _show_menu(self, current_selection: int, menu_items: list[str]) -> None: + """Show 
menu with current selection highlighted.""" + console.info("\nāš™ļø Main Menu:") + console.info("─" * 65) + for i, item in enumerate(menu_items): + if i == current_selection: + console.info(f" ā–¶ {item}") + else: + console.info(f" {item}") + console.info("\nšŸ’” Use ↑/↓ arrows to navigate, Enter to select, or type 1-6") + console.info("Press Ctrl+C to exit") diff --git a/src/uipath/_cli/_utils/_eval_set.py b/src/uipath/_cli/_utils/_eval_set.py index 9e95d0c71..53d55e216 100644 --- a/src/uipath/_cli/_utils/_eval_set.py +++ b/src/uipath/_cli/_utils/_eval_set.py @@ -13,7 +13,7 @@ class EvalHelpers: @staticmethod def auto_discover_eval_set() -> str: - """Auto-discover evaluation set from evals/eval-sets directory. + """Auto-discover evaluation set from evaluationSets or evals/eval-sets directory. Returns: Path to the evaluation set file @@ -21,19 +21,24 @@ def auto_discover_eval_set() -> str: Raises: ValueError: If no eval set found or multiple eval sets exist """ - eval_sets_dir = Path("evals/eval-sets") + # Try evaluationSets folder first (new structure) + eval_sets_dir = Path("evaluationSets") + + # Fall back to evals/eval-sets (old structure) + if not eval_sets_dir.exists(): + eval_sets_dir = Path("evals/eval-sets") if not eval_sets_dir.exists(): raise ValueError( - "No 'evals/eval-sets' directory found. " - "Please set 'UIPATH_PROJECT_ID' env var and run 'uipath pull'." + "No 'evaluationSets' or 'evals/eval-sets' directory found. " + "Please create an evaluation set or set 'UIPATH_PROJECT_ID' env var and run 'uipath pull'." ) eval_set_files = list(eval_sets_dir.glob("*.json")) if not eval_set_files: raise ValueError( - "No evaluation set files found in 'evals/eval-sets' directory. " + f"No evaluation set files found in '{eval_sets_dir}' directory. " ) if len(eval_set_files) > 1: diff --git a/src/uipath/_cli/cli_eval.py b/src/uipath/_cli/cli_eval.py index 53dd3bc12..bfdcdbcde 100644 --- a/src/uipath/_cli/cli_eval.py +++ b/src/uipath/_cli/cli_eval.py @@ -31,6 +31,57 @@ console = ConsoleLogger() +def _display_local_results(results_data): + """Display evaluation results locally in a formatted way.""" + if not results_data: + return + + evaluation_set_name = results_data.get("evaluationSetName", "Unknown") + overall_score = results_data.get("score", 0.0) + evaluation_results = results_data.get("evaluationSetResults", []) + + console.info(f"\nšŸŽÆ Evaluation Report: {evaluation_set_name}") + console.info(f"šŸ“Š Overall Score: {overall_score:.1f}%") + console.info("=" * 60) + + passed_count = 0 + total_count = len(evaluation_results) + + for i, test in enumerate(evaluation_results, 1): + test_score = test.get("score", 0.0) + test_name = test.get("evaluationName", f"Test {i}") + + if test_score == 100.0: + status = "āœ… PASS" + passed_count += 1 + elif test_score == 0.0: + status = "āŒ FAIL" + else: + status = "āš ļø PARTIAL" + passed_count += 0.5 # Partial credit + + console.info(f"\n{i}. 
{test_name}: {status} ({test_score:.1f}%)") + + evaluator_results = test.get("evaluationRunResults", []) + for evaluator_result in evaluator_results: + evaluator_name = evaluator_result.get("evaluatorName", "Unknown Evaluator") + result = evaluator_result.get("result", {}) + score = result.get("score", 0.0) + eval_time = result.get("evaluationTime", 0.0) + console.info( + f" └─ {evaluator_name}: {score:.1f}% ({eval_time * 1000:.2f}ms)" + ) + + console.info(f"\nšŸŽÆ Summary: {int(passed_count)}/{total_count} tests passed") + if overall_score == 100.0: + console.success("šŸŽ‰ All tests passed!") + elif overall_score == 0.0: + console.info("šŸ’„ All tests failed!") + else: + console.info(f"⚔ Partial success: {overall_score:.1f}% overall score") + console.info("") + + class LiteralOption(click.Option): def type_cast_value(self, ctx, value): try: @@ -61,6 +112,12 @@ def type_cast_value(self, ctx, value): type=click.Path(exists=False), help="File path where the output will be written", ) +@click.option( + "--interactive", + is_flag=True, + help="Launch streamlined keyboard-only interactive CLI", + default=False, +) @track(when=lambda *_a, **_kw: os.getenv(ENV_JOB_ID) is None) def eval( entrypoint: Optional[str], @@ -69,6 +126,7 @@ def eval( no_report: bool, workers: int, output_file: Optional[str], + interactive: bool, ) -> None: """Run an evaluation set against the agent. @@ -78,7 +136,21 @@ def eval( eval_ids: Optional list of evaluation IDs workers: Number of parallel workers for running evaluations no_report: Do not report the evaluation results + interactive: Launch streamlined keyboard-only interactive CLI """ + # Handle interactive mode + if interactive: + try: + from ._interactive import launch_interactive_cli + + launch_interactive_cli() + return + except ImportError as e: + console.error(f"Interactive mode requires additional dependencies: {e}") + return + except Exception as e: + console.error(f"Failed to launch interactive mode: {e}") + return if not no_report and not os.getenv("UIPATH_FOLDER_KEY"): os.environ["UIPATH_FOLDER_KEY"] = asyncio.run( get_personal_workspace_key_async() @@ -131,16 +203,24 @@ def generate_runtime_context(**context_kwargs) -> UiPathRuntimeContext: if eval_context.job_id: runtime_factory.add_span_exporter(LlmOpsHttpExporter()) + eval_runtime_ref = None + async def execute(): + nonlocal eval_runtime_ref async with UiPathEvalRuntime.from_eval_context( factory=runtime_factory, context=eval_context, event_bus=event_bus, ) as eval_runtime: + eval_runtime_ref = eval_runtime await eval_runtime.execute() await event_bus.wait_for_all(timeout=10) asyncio.run(execute()) + + # Display results locally when --no-report is used + if no_report and eval_runtime_ref and eval_runtime_ref.context.result: + _display_local_results(eval_runtime_ref.context.result.output) except Exception as e: console.error( f"Error: Unexpected error occurred - {str(e)}", include_traceback=True
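Usage sketch for the new interactive mode (assumptions: the package is installed and the command is run from an agent project root containing main.py plus evaluationSets/ and evaluators/ folders). The flag added above is invoked as "uipath eval --interactive"; the same entry point can also be launched directly from Python:

    from pathlib import Path

    from uipath._cli._interactive import launch_interactive_cli

    # Discovers eval sets and evaluators under the given root, shows the arrow-key menu,
    # and falls back to a numbered text menu when the terminal lacks termios support.
    launch_interactive_cli(project_root=Path.cwd())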