Skip to content

Commit daa0ec9

Browse files
committed
Add Backend column to results and improve LangGraph example
- Add adapter_name field to EvaluationResult to track which backend is used
- Display Backend column in evaluation summary table
- Fix evalview connect to detect LangGraph Cloud API via /ok and /info endpoints
- Improve LangGraph example with screenshot, clear setup steps, and troubleshooting
1 parent 93e67f2 commit daa0ec9

File tree

7 files changed

+170
-49
lines changed

7 files changed

+170
-49
lines changed

evalview/cli.py

Lines changed: 38 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -490,8 +490,9 @@ def get_adapter_for_test(test_case):
490490
# Execute agent
491491
trace = await test_adapter.execute(test_case.input.query, test_case.input.context)
492492

493-
# Evaluate
494-
result = await evaluator.evaluate(test_case, trace)
493+
# Evaluate (pass adapter name for display)
494+
adapter_name = getattr(test_adapter, 'name', None)
495+
result = await evaluator.evaluate(test_case, trace, adapter_name=adapter_name)
495496
results.append(result)
496497

497498
# Track result and compare to baseline if enabled
@@ -749,52 +750,57 @@ async def _connect_async(endpoint: Optional[str]):
749750
# Common ports to check
750751
common_ports = [8000, 2024, 3000, 8080, 5000, 8888, 7860]
751752

752-
# Common endpoints to try (framework_type, name, path, adapter_type)
753+
# Common endpoints to try (framework_type, name, path, adapter_type, method)
753754
# Will be combined with common_ports
754755
common_patterns = [
755-
("langgraph", "LangGraph", "/api/chat", "langgraph"),
756-
("langgraph", "LangGraph", "/invoke", "langgraph"),
757-
("langgraph", "LangGraph", "/threads/runs/stream", "langgraph"), # LangGraph Cloud
758-
("http", "LangServe", "/agent", "http"),
759-
("streaming", "LangServe", "/agent/stream", "streaming"),
760-
("streaming", "TapeScope", "/api/unifiedchat", "streaming"),
761-
("crewai", "CrewAI", "/crew", "crewai"),
762-
("http", "FastAPI", "/api/agent", "http"),
763-
("http", "FastAPI", "/chat", "http"),
756+
("langgraph", "LangGraph Cloud", "/ok", "langgraph", "GET"), # LangGraph Cloud health
757+
("langgraph", "LangGraph Cloud", "/info", "langgraph", "GET"), # LangGraph Cloud info
758+
("langgraph", "LangGraph", "/api/chat", "langgraph", "POST"),
759+
("langgraph", "LangGraph", "/invoke", "langgraph", "POST"),
760+
("http", "LangServe", "/agent", "http", "POST"),
761+
("streaming", "LangServe", "/agent/stream", "streaming", "POST"),
762+
("streaming", "TapeScope", "/api/unifiedchat", "streaming", "POST"),
763+
("crewai", "CrewAI", "/crew", "crewai", "POST"),
764+
("http", "FastAPI", "/api/agent", "http", "POST"),
765+
("http", "FastAPI", "/chat", "http", "POST"),
764766
]
765767

766768
# Generate all port+path combinations
767769
common_endpoints = []
768770
for port in common_ports:
769-
for framework, name, path, adapter in common_patterns:
771+
for framework, name, path, adapter, method in common_patterns:
770772
url = f"http://127.0.0.1:{port}{path}"
771-
common_endpoints.append((framework, f"{name} (:{port})", url, adapter))
773+
common_endpoints.append((framework, f"{name} (:{port})", url, adapter, method))
772774

773775
endpoints_to_test = []
774776
if endpoint:
775777
# User provided specific endpoint - try to detect adapter type
776-
endpoints_to_test = [("http", "Custom", endpoint, "http")]
778+
endpoints_to_test = [("http", "Custom", endpoint, "http", "POST")]
777779
else:
778780
# Try common ones
779781
endpoints_to_test = common_endpoints
780782

781783
successful = None
782784

783785
async with httpx.AsyncClient(timeout=5.0) as client:
784-
for adapter_type, name, url, default_adapter in endpoints_to_test:
786+
for adapter_type, name, url, default_adapter, method in endpoints_to_test:
785787
try:
786788
console.print(f"[dim]Testing {name}: {url}...[/dim]", end=" ")
787789

788-
# Try a simple POST request
789-
response = await client.post(
790-
url,
791-
json={
792-
"query": "test",
793-
"message": "test",
794-
"messages": [{"role": "user", "content": "test"}],
795-
},
796-
headers={"Content-Type": "application/json"},
797-
)
790+
# Use appropriate HTTP method
791+
if method == "GET":
792+
response = await client.get(url)
793+
else:
794+
# Try a simple POST request
795+
response = await client.post(
796+
url,
797+
json={
798+
"query": "test",
799+
"message": "test",
800+
"messages": [{"role": "user", "content": "test"}],
801+
},
802+
headers={"Content-Type": "application/json"},
803+
)
798804

799805
if response.status_code in [
800806
200,
@@ -864,14 +870,18 @@ async def _connect_async(endpoint: Optional[str]):
864870

865871
# Update config with detected adapter
866872
config["adapter"] = detected_adapter
867-
config["endpoint"] = url
873+
# For LangGraph Cloud, use base URL (strip /ok or /info)
874+
endpoint_url = url
875+
if detected_adapter == "langgraph" and (url.endswith("/ok") or url.endswith("/info")):
876+
endpoint_url = url.rsplit("/", 1)[0]
877+
config["endpoint"] = endpoint_url
868878

869879
with open(config_path, "w") as f:
870880
yaml.dump(config, f, default_flow_style=False, sort_keys=False)
871881

872882
console.print("[green]✅ Updated config:[/green]")
873883
console.print(f" • adapter: {detected_adapter}")
874-
console.print(f" • endpoint: {url}")
884+
console.print(f" • endpoint: {endpoint_url}")
875885
console.print()
876886
console.print("[blue]Next steps:[/blue]")
877887
console.print(" 1. Create test cases in tests/test-cases/")

evalview/core/types.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,9 @@ class EvaluationResult(BaseModel):
256256
trace: ExecutionTrace
257257
timestamp: datetime
258258

259+
# Adapter info for dynamic display
260+
adapter_name: Optional[str] = None # e.g., "langgraph", "crewai", "tapescope"
261+
259262
# User-facing fields for reports
260263
input_query: Optional[str] = None
261264
actual_output: Optional[str] = None

evalview/evaluators/evaluator.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,16 @@ def __init__(self, openai_api_key: Optional[str] = None):
3535
self.hallucination_evaluator = HallucinationEvaluator(openai_api_key)
3636
self.safety_evaluator = SafetyEvaluator(openai_api_key)
3737

38-
async def evaluate(self, test_case: TestCase, trace: ExecutionTrace) -> EvaluationResult:
38+
async def evaluate(
39+
self, test_case: TestCase, trace: ExecutionTrace, adapter_name: Optional[str] = None
40+
) -> EvaluationResult:
3941
"""
4042
Run complete evaluation on a test case.
4143
4244
Args:
4345
test_case: Test case with expected behavior
4446
trace: Execution trace from agent
47+
adapter_name: Name of the adapter used (e.g., "langgraph", "crewai")
4548
4649
Returns:
4750
Complete evaluation result
@@ -70,6 +73,7 @@ async def evaluate(self, test_case: TestCase, trace: ExecutionTrace) -> Evaluati
7073
evaluations=evaluations,
7174
trace=trace,
7275
timestamp=datetime.now(),
76+
adapter_name=adapter_name,
7377
input_query=test_case.input.query,
7478
actual_output=trace.final_output,
7579
)

evalview/reporters/console_reporter.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ def print_summary(self, results: List[EvaluationResult]) -> None:
147147
# Summary table
148148
table = Table(title="📊 Evaluation Summary", show_header=True)
149149
table.add_column("Test Case", style="cyan")
150+
table.add_column("Backend", style="magenta")
150151
table.add_column("Score", justify="right")
151152
table.add_column("Status")
152153
table.add_column("Cost", justify="right")
@@ -168,8 +169,12 @@ def print_summary(self, results: List[EvaluationResult]) -> None:
168169
else:
169170
tokens_str = "N/A"
170171

172+
# Get adapter name (capitalize for display)
173+
adapter_display = (result.adapter_name or "unknown").capitalize()
174+
171175
table.add_row(
172176
result.test_case,
177+
adapter_display,
173178
f"[{score_color}]{result.score:.1f}[/{score_color}]",
174179
status,
175180
f"${result.trace.metrics.total_cost:.4f}",

examples/langgraph/README.md

Lines changed: 118 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,42 +1,141 @@
11
# LangGraph Example
22

3-
Test a LangGraph research agent with EvalView.
3+
Test LangGraph agents with EvalView - capture tool calls, measure latency, cost, and output quality.
44

5-
## Setup
5+
## Example Output
66

7-
### 1. Clone LangGraph Examples
7+
![EvalView LangGraph Results](screenshot.png)
8+
9+
<details>
10+
<summary>Text version</summary>
11+
12+
```
13+
📊 Evaluation Summary
14+
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┓
15+
┃ Test Case ┃ Backend ┃ Score ┃ Status ┃ Cost ┃ Tokens ┃ Latency ┃
16+
┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━┩
17+
│ Conversational Test │ Langgraph │ 80.0 │ ✅ PASSED │ $0.0014 │ 321 │ 6533ms │
18+
│ Search Test │ Langgraph │ 85.0 │ ✅ PASSED │ $0.0024 │ 720 │ 7244ms │
19+
│ Multi-Step Research │ Langgraph │ 90.0 │ ✅ PASSED │ $0.0089 │ 2,450 │ 12340ms │
20+
└─────────────────────┴───────────┴───────┴───────────┴─────────┴────────┴─────────┘
21+
22+
Execution Flow (3 steps)
23+
├── Step 1: tavily_search ✓ [2100ms | $0.0020]
24+
│ └── → params: {"query": "AI agents 2024 trends"}
25+
├── Step 2: tavily_search ✓ [1800ms | $0.0020]
26+
│ └── → params: {"query": "LangGraph vs AutoGPT comparison"}
27+
└── Step 3: summarize ✓ [3200ms | $0.0049]
28+
└── → params: {"content": "Based on the search results..."}
29+
```
30+
31+
</details>
32+
33+
## Quick Start
34+
35+
### 1. Install Dependencies
836

937
```bash
10-
# Option A: LangGraph quickstart
11-
pip install langgraph langchain-openai
38+
# Python 3.11+ required
39+
pip install "langgraph-cli[inmem]" langchain-openai langchain-anthropic tavily-python
40+
```
1241

13-
# Option B: Clone full examples repo
14-
git clone https://github.com/langchain-ai/langgraph.git
15-
cd langgraph/examples
42+
### 2. Set API Keys
43+
44+
```bash
45+
export OPENAI_API_KEY=sk-...
46+
export TAVILY_API_KEY=tvly-... # Get free key at tavily.com
1647
```
1748

18-
### 2. Start the Agent
49+
### 3. Start LangGraph Server
50+
51+
**Option A: Use the included example agent**
1952

2053
```bash
21-
# Using LangGraph CLI
54+
cd examples/langgraph/agent
2255
langgraph dev
56+
```
2357

24-
# Or run the example server
25-
cd langgraph/examples/chat_agent_executor
26-
python server.py
58+
**Option B: Use your own LangGraph agent**
59+
60+
```bash
61+
cd /path/to/your/langgraph/project
62+
langgraph dev
2763
```
2864

29-
Agent will be available at: `http://localhost:8123`
65+
Server runs at: `http://localhost:2024`
3066

31-
### 3. Run EvalView Test
67+
### 4. Run Tests
3268

3369
```bash
3470
# From EvalView root
35-
evalview run --pattern examples/langgraph/test-case.yaml
71+
evalview run --pattern examples/langgraph/
72+
```
73+
74+
## Test Cases
75+
76+
| Test | What it checks |
77+
|------|---------------|
78+
| `conversational.yaml` | Basic Q&A without tools |
79+
| `search.yaml` | Web search tool usage |
80+
| `multi-step.yaml` | Multi-tool research workflow |
81+
82+
## Configuration
83+
84+
EvalView auto-detects LangGraph Cloud API on port 2024. To configure manually:
85+
86+
```yaml
87+
# .evalview/config.yaml
88+
adapter: langgraph
89+
endpoint: http://localhost:2024
90+
assistant_id: agent # Your graph name from langgraph.json
91+
timeout: 90
3692
```
3793
94+
## Writing Test Cases
95+
96+
```yaml
97+
name: "My Test"
98+
adapter: langgraph
99+
endpoint: http://localhost:2024
100+
101+
input:
102+
query: "What are the latest AI trends?"
103+
context:
104+
assistant_id: agent # Optional: override default assistant
105+
106+
expected:
107+
tools:
108+
- tavily_search # Expected tools to be called
109+
output:
110+
contains:
111+
- "AI"
112+
- "trends"
113+
114+
thresholds:
115+
min_score: 70
116+
max_cost: 0.10
117+
max_latency: 30000
118+
```
119+
120+
## Troubleshooting
121+
122+
**"Python 3.11+ required"**
123+
```bash
124+
# Use conda or pyenv
125+
conda create -n langgraph python=3.12
126+
conda activate langgraph
127+
```
128+
129+
**"TAVILY_API_KEY not found"**
130+
- Get a free key at [tavily.com](https://tavily.com)
131+
- Or modify the agent to remove the search tool
132+
133+
**"Connection refused on port 2024"**
134+
- Make sure `langgraph dev` is running
135+
- Check for errors in the server terminal
136+
38137
## Links
39138

40-
- **Repo**: https://github.com/langchain-ai/langgraph
41-
- **Quickstart**: https://langchain-ai.github.io/langgraph/tutorials/introduction/
42-
- **Examples**: https://github.com/langchain-ai/langgraph/tree/main/examples
139+
- [LangGraph Docs](https://langchain-ai.github.io/langgraph/)
140+
- [LangGraph GitHub](https://github.com/langchain-ai/langgraph)
141+
- [EvalView Docs](../../docs/)

examples/langgraph/screenshot.png

295 KB
Loading

examples/langgraph/test-case.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ description: "Test research agent's ability to search and synthesize information
66

77
# Configure adapter for LangGraph
88
adapter: langgraph
9-
endpoint: http://localhost:8123
9+
endpoint: http://localhost:2024
1010

1111
input:
1212
query: "Research the latest developments in AI agents and summarize the key trends"

0 commit comments

Comments
 (0)