Skip to content

Commit ed4e832

Browse files
committed
- Add PATH argument to 'evalview run' for direct path usage
1 parent 22a5799 commit ed4e832

File tree

3 files changed

+125
-26
lines changed

3 files changed

+125
-26
lines changed

evalview/cli.py

Lines changed: 65 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1064,6 +1064,7 @@ async def _init_wizard_async(dir: str):
10641064

10651065

10661066
@main.command()
1067+
@click.argument("path", required=False, default=None)
10671068
@click.option(
10681069
"--pattern",
10691070
default="*.yaml",
@@ -1139,6 +1140,7 @@ async def _init_wizard_async(dir: str):
11391140
help="Generate HTML report to specified path",
11401141
)
11411142
def run(
1143+
path: Optional[str],
11421144
pattern: str,
11431145
test: tuple,
11441146
filter: str,
@@ -1154,14 +1156,19 @@ def run(
11541156
watch: bool,
11551157
html_report: str,
11561158
):
1157-
"""Run test cases against the agent."""
1159+
"""Run test cases against the agent.
1160+
1161+
PATH can be a directory containing test cases (e.g., examples/anthropic)
1162+
or a specific test file (e.g., examples/anthropic/test-case.yaml).
1163+
"""
11581164
asyncio.run(_run_async(
1159-
pattern, test, filter, output, verbose, track, compare_baseline, debug,
1165+
path, pattern, test, filter, output, verbose, track, compare_baseline, debug,
11601166
sequential, max_workers, max_retries, retry_delay, watch, html_report
11611167
))
11621168

11631169

11641170
async def _run_async(
1171+
path: Optional[str],
11651172
pattern: str,
11661173
test: tuple,
11671174
filter: str,
@@ -1185,6 +1192,13 @@ async def _run_async(
11851192
from evalview.core.retry import RetryConfig, with_retry
11861193
from evalview.core.config import ScoringWeights
11871194

1195+
# Load .env.local from the PATH directory (or from the file's parent directory when PATH is a file), overriding already-set variables
1196+
if path:
1197+
target_dir = Path(path) if Path(path).is_dir() else Path(path).parent
1198+
path_env = target_dir / ".env.local"
1199+
if path_env.exists():
1200+
load_dotenv(dotenv_path=str(path_env), override=True)
1201+
11881202
# Interactive provider selection for LLM-as-judge
11891203
result = get_or_select_provider(console)
11901204
if result is None:
@@ -1235,10 +1249,25 @@ async def _run_async(
12351249

12361250
console.print("[blue]Running test cases...[/blue]\n")
12371251

1238-
# Load config
1239-
config_path = Path(".evalview/config.yaml")
1252+
# Load config - check path directory first, then current directory
1253+
config_path = None
1254+
if path:
1255+
# Check for config in the provided path directory
1256+
target_dir = Path(path) if Path(path).is_dir() else Path(path).parent
1257+
path_config = target_dir / ".evalview" / "config.yaml"
1258+
if path_config.exists():
1259+
config_path = path_config
1260+
if verbose:
1261+
console.print(f"[dim]📂 Using config from: {path_config}[/dim]")
1262+
1263+
# Fall back to current directory config
1264+
if config_path is None:
1265+
config_path = Path(".evalview/config.yaml")
1266+
12401267
if not config_path.exists():
12411268
console.print("[red]❌ Config file not found. Run 'evalview init' first.[/red]")
1269+
if path:
1270+
console.print(f"[dim]Looked in: {Path(path) / '.evalview/config.yaml'} and .evalview/config.yaml[/dim]")
12421271
return
12431272

12441273
with open(config_path) as f:
@@ -1357,9 +1386,30 @@ async def _run_async(
13571386
tracker = RegressionTracker()
13581387

13591388
# Load test cases
1389+
# Priority: 1. PATH argument (file or directory), 2. --pattern when it names an existing file, 3. tests/test-cases/ globbed with --pattern
1390+
1391+
# Check if path argument is provided (e.g., evalview run examples/anthropic)
1392+
if path:
1393+
target_path = Path(path)
1394+
if target_path.exists() and target_path.is_file():
1395+
# Load single file directly
1396+
try:
1397+
test_cases = [TestCaseLoader.load_from_file(target_path)]
1398+
if verbose:
1399+
console.print(f"[dim]📄 Loading test case from: {path}[/dim]\n")
1400+
except Exception as e:
1401+
console.print(f"[red]❌ Failed to load test case: {e}[/red]")
1402+
return
1403+
elif target_path.exists() and target_path.is_dir():
1404+
# Load all YAML files from specified directory
1405+
test_cases = TestCaseLoader.load_from_directory(target_path, "*.yaml")
1406+
if verbose:
1407+
console.print(f"[dim]📁 Loading test cases from: {path}[/dim]\n")
1408+
else:
1409+
console.print(f"[red]❌ Path not found: {path}[/red]")
1410+
return
13601411
# Check if pattern is a direct file path
1361-
pattern_path = Path(pattern)
1362-
if pattern_path.exists() and pattern_path.is_file():
1412+
elif (pattern_path := Path(pattern)).exists() and pattern_path.is_file():
13631413
# Load single file directly
13641414
try:
13651415
test_cases = [TestCaseLoader.load_from_file(pattern_path)]
@@ -1378,8 +1428,9 @@ async def _run_async(
13781428
test_cases_dir = Path("tests/test-cases")
13791429
if not test_cases_dir.exists():
13801430
console.print("[red]❌ Test cases directory not found: tests/test-cases[/red]")
1381-
console.print("[dim]Tip: You can also specify a file path directly:[/dim]")
1382-
console.print("[dim] evalview run --pattern path/to/test-case.yaml[/dim]")
1431+
console.print("[dim]Tip: You can specify a path or file directly:[/dim]")
1432+
console.print("[dim] evalview run examples/anthropic[/dim]")
1433+
console.print("[dim] evalview run path/to/test-case.yaml[/dim]")
13831434
return
13841435
test_cases = TestCaseLoader.load_from_directory(test_cases_dir, pattern)
13851436

@@ -1610,8 +1661,13 @@ async def execute_single_test(test_case):
16101661
"""Execute a single test case with optional retry logic."""
16111662
test_adapter = get_adapter_for_test(test_case)
16121663

1664+
# Merge test case tools into context for adapters that support them
1665+
context = dict(test_case.input.context) if test_case.input.context else {}
1666+
if hasattr(test_case, 'tools') and test_case.tools:
1667+
context['tools'] = test_case.tools
1668+
16131669
async def _execute():
1614-
return await test_adapter.execute(test_case.input.query, test_case.input.context)
1670+
return await test_adapter.execute(test_case.input.query, context)
16151671

16161672
# Execute with retry if configured
16171673
if retry_config.max_retries > 0:

evalview/core/types.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,10 @@ class TestCase(BaseModel):
113113
endpoint: Optional[str] = None # e.g., "http://127.0.0.1:2024"
114114
adapter_config: Optional[Dict[str, Any]] = None # Additional adapter settings
115115

116+
# Optional: Tool definitions for adapters that support them (e.g., Anthropic, OpenAI)
117+
# Each tool should have: name, description, input_schema
118+
tools: Optional[List[Dict[str, Any]]] = None
119+
116120

117121
# ============================================================================
118122
# Execution Trace Types

examples/anthropic/README.md

Lines changed: 56 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,63 @@
22

33
Test Anthropic Claude models with tool use in EvalView.
44

5-
## Setup
5+
## Quick Start
66

7-
### 1. Set Environment Variables
7+
### First Time Setup
88

99
```bash
10+
# 1. Go to EvalView root directory
11+
cd /path/to/EvalView
12+
13+
# 2. Create virtual environment
14+
python3 -m venv venv
15+
16+
# 3. Activate it
17+
source venv/bin/activate
18+
19+
# 4. Install EvalView + Anthropic SDK
20+
pip install -e .
21+
pip install anthropic
22+
23+
# 5. Set your API key (in .env.local or environment)
24+
export ANTHROPIC_API_KEY=your-api-key
25+
# Or add ANTHROPIC_API_KEY=your-api-key to examples/anthropic/.env.local (loaded when you run `evalview run examples/anthropic`)
26+
27+
# 6. Run the test (from root directory)
28+
evalview run examples/anthropic
29+
```
30+
31+
### Already Set Up?
32+
33+
```bash
34+
cd /path/to/EvalView
35+
source venv/bin/activate
36+
evalview run examples/anthropic
37+
```
38+
39+
That's it! EvalView will:
40+
- Auto-detect your Anthropic API key
41+
- Run the test case against Claude
42+
- Evaluate tool usage and output quality
43+
44+
## Setup Details
45+
46+
### Environment Variables
47+
48+
```bash
49+
# Required: Anthropic API key
1050
export ANTHROPIC_API_KEY=your-api-key
51+
52+
# Optional: Choose evaluation provider (if you have multiple API keys)
53+
export EVAL_PROVIDER=anthropic # or openai, gemini, grok
54+
55+
# Optional: Override evaluation model
56+
export EVAL_MODEL=claude-haiku-4-5-20251001 # faster/cheaper for eval
1157
```
1258

13-
### 2. Define Tools (Optional)
59+
### Custom Tool Executor (Optional)
1460

15-
For tool use testing, define tools in your test case or create a tool executor:
61+
For real tool execution, create a tool executor:
1662

1763
```python
1864
# tools.py
@@ -21,22 +67,15 @@ def execute_tool(name: str, input: dict) -> str:
2167
if name == "get_weather":
2268
city = input.get("city", "unknown")
2369
return f"Weather in {city}: 72F, sunny"
24-
elif name == "calculator":
25-
op = input.get("operation")
26-
a, b = input.get("a", 0), input.get("b", 0)
27-
if op == "add":
28-
return str(a + b)
29-
elif op == "multiply":
30-
return str(a * b)
70+
elif name == "convert_temperature":
71+
value = input.get("value", 0)
72+
from_unit = input.get("from_unit", "celsius")
73+
if from_unit == "celsius":
74+
return str(value * 9/5 + 32) + "F"
75+
return str((value - 32) * 5/9) + "C"
3176
return f"Unknown tool: {name}"
3277
```
3378

34-
### 3. Run EvalView Test
35-
36-
```bash
37-
evalview run --pattern examples/anthropic/test-case.yaml
38-
```
39-
4079
## Supported Models (November 2025)
4180

4281
| Model | API ID | Input/MTok | Output/MTok |

0 commit comments

Comments
 (0)