Skip to content

Commit daa0ec9

Browse files
committed
Add Backend column to results and improve LangGraph example
- Add adapter_name field to EvaluationResult to track which backend is used
- Display Backend column in evaluation summary table
- Fix evalview connect to detect LangGraph Cloud API via /ok and /info endpoints
- Improve LangGraph example with screenshot, clear setup steps, and troubleshooting
1 parent 93e67f2 commit daa0ec9

File tree

7 files changed

+170
-49
lines changed

7 files changed

+170
-49
lines changed

evalview/cli.py

Lines changed: 38 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -490,8 +490,9 @@ def get_adapter_for_test(test_case):
490490
# Execute agent
491491
trace = await test_adapter.execute(test_case.input.query, test_case.input.context)
492492

493-
# Evaluate
494-
result = await evaluator.evaluate(test_case, trace)
493+
# Evaluate (pass adapter name for display)
494+
adapter_name = getattr(test_adapter, 'name', None)
495+
result = await evaluator.evaluate(test_case, trace, adapter_name=adapter_name)
495496
results.append(result)
496497

497498
# Track result and compare to baseline if enabled
@@ -749,52 +750,57 @@ async def _connect_async(endpoint: Optional[str]):
749750
# Common ports to check
750751
common_ports = [8000, 2024, 3000, 8080, 5000, 8888, 7860]
751752

752-
# Common endpoints to try (framework_type, name, path, adapter_type)
753+
# Common endpoints to try (framework_type, name, path, adapter_type, method)
753754
# Will be combined with common_ports
754755
common_patterns = [
755-
("langgraph", "LangGraph", "/api/chat", "langgraph"),
756-
("langgraph", "LangGraph", "/invoke", "langgraph"),
757-
("langgraph", "LangGraph", "/threads/runs/stream", "langgraph"), # LangGraph Cloud
758-
("http", "LangServe", "/agent", "http"),
759-
("streaming", "LangServe", "/agent/stream", "streaming"),
760-
("streaming", "TapeScope", "/api/unifiedchat", "streaming"),
761-
("crewai", "CrewAI", "/crew", "crewai"),
762-
("http", "FastAPI", "/api/agent", "http"),
763-
("http", "FastAPI", "/chat", "http"),
756+
("langgraph", "LangGraph Cloud", "/ok", "langgraph", "GET"), # LangGraph Cloud health
757+
("langgraph", "LangGraph Cloud", "/info", "langgraph", "GET"), # LangGraph Cloud info
758+
("langgraph", "LangGraph", "/api/chat", "langgraph", "POST"),
759+
("langgraph", "LangGraph", "/invoke", "langgraph", "POST"),
760+
("http", "LangServe", "/agent", "http", "POST"),
761+
("streaming", "LangServe", "/agent/stream", "streaming", "POST"),
762+
("streaming", "TapeScope", "/api/unifiedchat", "streaming", "POST"),
763+
("crewai", "CrewAI", "/crew", "crewai", "POST"),
764+
("http", "FastAPI", "/api/agent", "http", "POST"),
765+
("http", "FastAPI", "/chat", "http", "POST"),
764766
]
765767

766768
# Generate all port+path combinations
767769
common_endpoints = []
768770
for port in common_ports:
769-
for framework, name, path, adapter in common_patterns:
771+
for framework, name, path, adapter, method in common_patterns:
770772
url = f"http://127.0.0.1:{port}{path}"
771-
common_endpoints.append((framework, f"{name} (:{port})", url, adapter))
773+
common_endpoints.append((framework, f"{name} (:{port})", url, adapter, method))
772774

773775
endpoints_to_test = []
774776
if endpoint:
775777
# User provided specific endpoint - try to detect adapter type
776-
endpoints_to_test = [("http", "Custom", endpoint, "http")]
778+
endpoints_to_test = [("http", "Custom", endpoint, "http", "POST")]
777779
else:
778780
# Try common ones
779781
endpoints_to_test = common_endpoints
780782

781783
successful = None
782784

783785
async with httpx.AsyncClient(timeout=5.0) as client:
784-
for adapter_type, name, url, default_adapter in endpoints_to_test:
786+
for adapter_type, name, url, default_adapter, method in endpoints_to_test:
785787
try:
786788
console.print(f"[dim]Testing {name}: {url}...[/dim]", end=" ")
787789

788-
# Try a simple POST request
789-
response = await client.post(
790-
url,
791-
json={
792-
"query": "test",
793-
"message": "test",
794-
"messages": [{"role": "user", "content": "test"}],
795-
},
796-
headers={"Content-Type": "application/json"},
797-
)
790+
# Use appropriate HTTP method
791+
if method == "GET":
792+
response = await client.get(url)
793+
else:
794+
# Try a simple POST request
795+
response = await client.post(
796+
url,
797+
json={
798+
"query": "test",
799+
"message": "test",
800+
"messages": [{"role": "user", "content": "test"}],
801+
},
802+
headers={"Content-Type": "application/json"},
803+
)
798804

799805
if response.status_code in [
800806
200,
@@ -864,14 +870,18 @@ async def _connect_async(endpoint: Optional[str]):
864870

865871
# Update config with detected adapter
866872
config["adapter"] = detected_adapter
867-
config["endpoint"] = url
873+
# For LangGraph Cloud, use base URL (strip /ok or /info)
874+
endpoint_url = url
875+
if detected_adapter == "langgraph" and (url.endswith("/ok") or url.endswith("/info")):
876+
endpoint_url = url.rsplit("/", 1)[0]
877+
config["endpoint"] = endpoint_url
868878

869879
with open(config_path, "w") as f:
870880
yaml.dump(config, f, default_flow_style=False, sort_keys=False)
871881

872882
console.print("[green]✅ Updated config:[/green]")
873883
console.print(f" • adapter: {detected_adapter}")
874-
console.print(f" • endpoint: {url}")
884+
console.print(f" • endpoint: {endpoint_url}")
875885
console.print()
876886
console.print("[blue]Next steps:[/blue]")
877887
console.print(" 1. Create test cases in tests/test-cases/")

evalview/core/types.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,9 @@ class EvaluationResult(BaseModel):
256256
trace: ExecutionTrace
257257
timestamp: datetime
258258

259+
# Adapter info for dynamic display
260+
adapter_name: Optional[str] = None # e.g., "langgraph", "crewai", "tapescope"
261+
259262
# User-facing fields for reports
260263
input_query: Optional[str] = None
261264
actual_output: Optional[str] = None

evalview/evaluators/evaluator.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,16 @@ def __init__(self, openai_api_key: Optional[str] = None):
3535
self.hallucination_evaluator = HallucinationEvaluator(openai_api_key)
3636
self.safety_evaluator = SafetyEvaluator(openai_api_key)
3737

38-
async def evaluate(self, test_case: TestCase, trace: ExecutionTrace) -> EvaluationResult:
38+
async def evaluate(
39+
self, test_case: TestCase, trace: ExecutionTrace, adapter_name: Optional[str] = None
40+
) -> EvaluationResult:
3941
"""
4042
Run complete evaluation on a test case.
4143
4244
Args:
4345
test_case: Test case with expected behavior
4446
trace: Execution trace from agent
47+
adapter_name: Name of the adapter used (e.g., "langgraph", "crewai")
4548
4649
Returns:
4750
Complete evaluation result
@@ -70,6 +73,7 @@ async def evaluate(self, test_case: TestCase, trace: ExecutionTrace) -> Evaluati
7073
evaluations=evaluations,
7174
trace=trace,
7275
timestamp=datetime.now(),
76+
adapter_name=adapter_name,
7377
input_query=test_case.input.query,
7478
actual_output=trace.final_output,
7579
)

evalview/reporters/console_reporter.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ def print_summary(self, results: List[EvaluationResult]) -> None:
147147
# Summary table
148148
table = Table(title="📊 Evaluation Summary", show_header=True)
149149
table.add_column("Test Case", style="cyan")
150+
table.add_column("Backend", style="magenta")
150151
table.add_column("Score", justify="right")
151152
table.add_column("Status")
152153
table.add_column("Cost", justify="right")
@@ -168,8 +169,12 @@ def print_summary(self, results: List[EvaluationResult]) -> None:
168169
else:
169170
tokens_str = "N/A"
170171

172+
# Get adapter name (capitalize for display)
173+
adapter_display = (result.adapter_name or "unknown").capitalize()
174+
171175
table.add_row(
172176
result.test_case,
177+
adapter_display,
173178
f"[{score_color}]{result.score:.1f}[/{score_color}]",
174179
status,
175180
f"${result.trace.metrics.total_cost:.4f}",

examples/langgraph/README.md

Lines changed: 118 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,42 +1,141 @@
11
# LangGraph Example
22

3-
Test a LangGraph research agent with EvalView.
3+
Test LangGraph agents with EvalView - capture tool calls, measure latency, cost, and output quality.
44

5-
## Setup
5+
## Example Output
66

7-
### 1. Clone LangGraph Examples
7+
![EvalView LangGraph Results](screenshot.png)
8+
9+
<details>
10+
<summary>Text version</summary>
11+
12+
```
13+
📊 Evaluation Summary
14+
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┓
15+
┃ Test Case ┃ Backend ┃ Score ┃ Status ┃ Cost ┃ Tokens ┃ Latency ┃
16+
┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━┩
17+
│ Conversational Test │ Langgraph │ 80.0 │ ✅ PASSED │ $0.0014 │ 321 │ 6533ms │
18+
│ Search Test │ Langgraph │ 85.0 │ ✅ PASSED │ $0.0024 │ 720 │ 7244ms │
19+
│ Multi-Step Research │ Langgraph │ 90.0 │ ✅ PASSED │ $0.0089 │ 2,450 │ 12340ms │
20+
└─────────────────────┴───────────┴───────┴───────────┴─────────┴────────┴─────────┘
21+
22+
Execution Flow (3 steps)
23+
├── Step 1: tavily_search ✓ [2100ms | $0.0020]
24+
│ └── → params: {"query": "AI agents 2024 trends"}
25+
├── Step 2: tavily_search ✓ [1800ms | $0.0020]
26+
│ └── → params: {"query": "LangGraph vs AutoGPT comparison"}
27+
└── Step 3: summarize ✓ [3200ms | $0.0049]
28+
└── → params: {"content": "Based on the search results..."}
29+
```
30+
31+
</details>
32+
33+
## Quick Start
34+
35+
### 1. Install Dependencies
836

937
```bash
10-
# Option A: LangGraph quickstart
11-
pip install langgraph langchain-openai
38+
# Python 3.11+ required
39+
pip install "langgraph-cli[inmem]" langchain-openai langchain-anthropic tavily-python
40+
```
1241

13-
# Option B: Clone full examples repo
14-
git clone https://github.com/langchain-ai/langgraph.git
15-
cd langgraph/examples
42+
### 2. Set API Keys
43+
44+
```bash
45+
export OPENAI_API_KEY=sk-...
46+
export TAVILY_API_KEY=tvly-... # Get free key at tavily.com
1647
```
1748

18-
### 2. Start the Agent
49+
### 3. Start LangGraph Server
50+
51+
**Option A: Use the included example agent**
1952

2053
```bash
21-
# Using LangGraph CLI
54+
cd examples/langgraph/agent
2255
langgraph dev
56+
```
2357

24-
# Or run the example server
25-
cd langgraph/examples/chat_agent_executor
26-
python server.py
58+
**Option B: Use your own LangGraph agent**
59+
60+
```bash
61+
cd /path/to/your/langgraph/project
62+
langgraph dev
2763
```
2864

29-
Agent will be available at: `http://localhost:8123`
65+
Server runs at: `http://localhost:2024`
3066

31-
### 3. Run EvalView Test
67+
### 4. Run Tests
3268

3369
```bash
3470
# From EvalView root
35-
evalview run --pattern examples/langgraph/test-case.yaml
71+
evalview run --pattern examples/langgraph/
72+
```
73+
74+
## Test Cases
75+
76+
| Test | What it checks |
77+
|------|---------------|
78+
| `conversational.yaml` | Basic Q&A without tools |
79+
| `search.yaml` | Web search tool usage |
80+
| `multi-step.yaml` | Multi-tool research workflow |
81+
82+
## Configuration
83+
84+
EvalView auto-detects LangGraph Cloud API on port 2024. To configure manually:
85+
86+
```yaml
87+
# .evalview/config.yaml
88+
adapter: langgraph
89+
endpoint: http://localhost:2024
90+
assistant_id: agent # Your graph name from langgraph.json
91+
timeout: 90
3692
```
3793
94+
## Writing Test Cases
95+
96+
```yaml
97+
name: "My Test"
98+
adapter: langgraph
99+
endpoint: http://localhost:2024
100+
101+
input:
102+
query: "What are the latest AI trends?"
103+
context:
104+
assistant_id: agent # Optional: override default assistant
105+
106+
expected:
107+
tools:
108+
- tavily_search # Expected tools to be called
109+
output:
110+
contains:
111+
- "AI"
112+
- "trends"
113+
114+
thresholds:
115+
min_score: 70
116+
max_cost: 0.10
117+
max_latency: 30000
118+
```
119+
120+
## Troubleshooting
121+
122+
**"Python 3.11+ required"**
123+
```bash
124+
# Use conda or pyenv
125+
conda create -n langgraph python=3.12
126+
conda activate langgraph
127+
```
128+
129+
**"TAVILY_API_KEY not found"**
130+
- Get a free key at [tavily.com](https://tavily.com)
131+
- Or modify the agent to remove the search tool
132+
133+
**"Connection refused on port 2024"**
134+
- Make sure `langgraph dev` is running
135+
- Check for errors in the server terminal
136+
38137
## Links
39138

40-
- **Repo**: https://github.com/langchain-ai/langgraph
41-
- **Quickstart**: https://langchain-ai.github.io/langgraph/tutorials/introduction/
42-
- **Examples**: https://github.com/langchain-ai/langgraph/tree/main/examples
139+
- [LangGraph Docs](https://langchain-ai.github.io/langgraph/)
140+
- [LangGraph GitHub](https://github.com/langchain-ai/langgraph)
141+
- [EvalView Docs](../../docs/)

examples/langgraph/screenshot.png

295 KB
Loading

examples/langgraph/test-case.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ description: "Test research agent's ability to search and synthesize information
66

77
# Configure adapter for LangGraph
88
adapter: langgraph
9-
endpoint: http://localhost:8123
9+
endpoint: http://localhost:2024
1010

1111
input:
1212
query: "Research the latest developments in AI agents and summarize the key trends"

0 commit comments

Comments
 (0)