Skip to content

Commit 22a5799

Browse files
committed
Add multi-provider LLM support for evaluation (OpenAI, Anthropic, Gemini, Grok)
1 parent 6d38a79 commit 22a5799

File tree

7 files changed

+759
-116
lines changed

7 files changed

+759
-116
lines changed

evalview/cli.py

Lines changed: 53 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,15 @@
1616

1717
from evalview.core.loader import TestCaseLoader
1818
from evalview.core.pricing import get_model_pricing_info
19+
from evalview.core.llm_provider import (
20+
detect_available_providers,
21+
get_missing_provider_message,
22+
get_provider_status,
23+
get_or_select_provider,
24+
save_provider_preference,
25+
PROVIDER_CONFIGS,
26+
LLMProvider,
27+
)
1928
from evalview.adapters.http_adapter import HTTPAdapter
2029
from evalview.adapters.tapescope_adapter import TapeScopeAdapter
2130
from evalview.adapters.langgraph_adapter import LangGraphAdapter
@@ -364,8 +373,9 @@ async def health():
364373
console.print("\n[bold]1. Start the demo agent:[/bold]")
365374
console.print(" [cyan]pip install fastapi uvicorn[/cyan]")
366375
console.print(" [cyan]python demo-agent/agent.py[/cyan]")
367-
console.print("\n[bold]2. In another terminal, run tests:[/bold]")
368-
console.print(" [cyan]export OPENAI_API_KEY='your-key-here'[/cyan]")
376+
console.print("\n[bold]2. In another terminal, set an API key (any one):[/bold]")
377+
console.print(" [cyan]export ANTHROPIC_API_KEY='your-key'[/cyan] [dim]# or OPENAI_API_KEY, GEMINI_API_KEY, XAI_API_KEY[/dim]")
378+
console.print("\n[bold]3. Run tests:[/bold]")
369379
console.print(" [cyan]evalview run[/cyan]")
370380
console.print("\n[dim]The demo agent runs on http://localhost:8000[/dim]")
371381
console.print("[dim]Edit tests/test-cases/example.yaml to add more tests[/dim]\n")
@@ -494,11 +504,13 @@ def quickstart():
494504
else:
495505
console.print("[bold]Step 3/4:[/bold] Config already exists\n")
496506

497-
# Check for OPENAI_API_KEY
498-
if not os.getenv("OPENAI_API_KEY"):
499-
console.print("[yellow]⚠️ OPENAI_API_KEY not set[/yellow]")
500-
console.print("\nTo complete the quickstart, set your OpenAI API key:")
501-
console.print(" [cyan]export OPENAI_API_KEY='your-key-here'[/cyan]\n")
507+
# Check for any LLM provider API key
508+
available_providers = detect_available_providers()
509+
if not available_providers:
510+
console.print("[yellow]⚠️ No LLM provider API key set[/yellow]")
511+
console.print("\nTo complete the quickstart, set at least one API key:")
512+
console.print(" [cyan]export ANTHROPIC_API_KEY='your-key'[/cyan] [dim]# recommended[/dim]")
513+
console.print(" [dim]# or: OPENAI_API_KEY, GEMINI_API_KEY, XAI_API_KEY[/dim]\n")
502514
console.print("Then run this command again.\n")
503515
return
504516

@@ -1173,18 +1185,21 @@ async def _run_async(
11731185
from evalview.core.retry import RetryConfig, with_retry
11741186
from evalview.core.config import ScoringWeights
11751187

1176-
# Validate OPENAI_API_KEY upfront (required for LLM-as-judge evaluation)
1177-
openai_api_key = os.getenv("OPENAI_API_KEY")
1178-
if not openai_api_key:
1179-
console.print("\n[red bold]❌ Error: OPENAI_API_KEY is required[/red bold]\n")
1180-
console.print("EvalView uses LLM-as-judge to evaluate output quality.")
1181-
console.print("Please set your OpenAI API key:\n")
1182-
console.print(" [cyan]export OPENAI_API_KEY='your-key-here'[/cyan]")
1183-
console.print("\nOr add it to your .env file:")
1184-
console.print(" [cyan]echo 'OPENAI_API_KEY=your-key-here' >> .env[/cyan]\n")
1185-
console.print("[dim]Get your API key at: https://platform.openai.com/api-keys[/dim]")
1188+
# Interactive provider selection for LLM-as-judge
1189+
result = get_or_select_provider(console)
1190+
if result is None:
11861191
return
11871192

1193+
selected_provider, selected_api_key = result
1194+
1195+
# Save preference for future runs
1196+
save_provider_preference(selected_provider)
1197+
1198+
# Set environment variable for the evaluators to use
1199+
config_for_provider = PROVIDER_CONFIGS[selected_provider]
1200+
os.environ["EVAL_PROVIDER"] = selected_provider.value
1201+
os.environ[config_for_provider.env_var] = selected_api_key
1202+
11881203
if debug:
11891204
console.print("[dim]🐛 Debug mode enabled - will show raw responses[/dim]\n")
11901205
verbose = True # Debug implies verbose
@@ -1229,14 +1244,17 @@ async def _run_async(
12291244
with open(config_path) as f:
12301245
config = yaml.safe_load(f)
12311246

1232-
# Extract model config
1247+
# Extract model config (can be string or dict)
12331248
model_config = config.get("model", {})
12341249
if verbose and model_config:
1235-
console.print(f"[dim]💰 Model: {model_config.get('name', 'gpt-5-mini')}[/dim]")
1236-
if "pricing" in model_config:
1237-
console.print(
1238-
f"[dim]💵 Custom pricing: ${model_config['pricing']['input_per_1m']:.2f} in, ${model_config['pricing']['output_per_1m']:.2f} out[/dim]"
1239-
)
1250+
if isinstance(model_config, str):
1251+
console.print(f"[dim]💰 Model: {model_config}[/dim]")
1252+
elif isinstance(model_config, dict):
1253+
console.print(f"[dim]💰 Model: {model_config.get('name', 'gpt-5-mini')}[/dim]")
1254+
if "pricing" in model_config:
1255+
console.print(
1256+
f"[dim]💵 Custom pricing: ${model_config['pricing']['input_per_1m']:.2f} in, ${model_config['pricing']['output_per_1m']:.2f} out[/dim]"
1257+
)
12401258

12411259
# SSRF protection config - defaults to True for local development
12421260
# Set to False in production when using untrusted test cases
@@ -1288,6 +1306,17 @@ async def _run_async(
12881306
model_config=model_config,
12891307
allow_private_urls=allow_private_urls,
12901308
)
1309+
elif adapter_type == "anthropic":
1310+
# Anthropic Claude adapter for direct API testing
1311+
from evalview.adapters.anthropic_adapter import AnthropicAdapter
1312+
adapter = AnthropicAdapter(
1313+
model=config.get("model", "claude-sonnet-4-5-20250929"),
1314+
tools=config.get("tools", []),
1315+
system_prompt=config.get("system_prompt"),
1316+
max_tokens=config.get("max_tokens", 4096),
1317+
timeout=config.get("timeout", 120.0),
1318+
verbose=verbose,
1319+
)
12911320
else:
12921321
# HTTP adapter for standard REST APIs
12931322
adapter = HTTPAdapter(
@@ -1298,19 +1327,8 @@ async def _run_async(
12981327
allow_private_urls=allow_private_urls,
12991328
)
13001329

1301-
# Validate OPENAI_API_KEY is set (required for LLM-as-judge evaluation)
1302-
openai_api_key = os.getenv("OPENAI_API_KEY")
1303-
if not openai_api_key:
1304-
console.print("\n[red bold]❌ Error: OPENAI_API_KEY is required for evaluation[/red bold]\n")
1305-
console.print("EvalView uses LLM-as-judge to evaluate agent output quality.")
1306-
console.print("Please set your OpenAI API key:\n")
1307-
console.print(" [cyan]export OPENAI_API_KEY='your-key-here'[/cyan]")
1308-
console.print("\nOr add it to your .env file:")
1309-
console.print(" [cyan]echo 'OPENAI_API_KEY=your-key-here' >> .env[/cyan]\n")
1310-
console.print("[dim]Get your API key at: https://platform.openai.com/api-keys[/dim]")
1311-
return
1312-
13131330
# Initialize evaluator with configurable weights
1331+
# (LLM provider is auto-detected by the OutputEvaluator)
13141332
scoring_weights = None
13151333
if "scoring" in config and "weights" in config["scoring"]:
13161334
try:
@@ -1321,7 +1339,6 @@ async def _run_async(
13211339
console.print(f"[yellow]⚠️ Invalid scoring weights in config: {e}. Using defaults.[/yellow]")
13221340

13231341
evaluator = Evaluator(
1324-
openai_api_key=openai_api_key,
13251342
default_weights=scoring_weights,
13261343
)
13271344

0 commit comments

Comments
 (0)