Skip to content

Commit ae4ac86

Browse files
committed
Add --summary and --coverage flags
1 parent 23c13cf commit ae4ac86

File tree

4 files changed

+468
-4
lines changed

4 files changed

+468
-4
lines changed

README.md

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,61 @@ Database config is optional – EvalView only uses it if you enable it in config
130130

131131
---
132132

133+
## Behavior Coverage (not line coverage)
134+
135+
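Line coverage is a poor fit for LLM agents: knowing that a line of code executed says nothing about whether the agent behaved correctly. Instead, EvalView focuses on **behavior coverage**: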
136+
137+
| Dimension | What it measures |
138+
|-----------|------------------|
139+
| **Tasks covered** | Which real-world scenarios have tests? |
140+
| **Tools exercised** | Are all your agent's tools being tested? |
141+
| **Paths hit** | Are multi-step workflows tested end-to-end? |
142+
| **Eval dimensions** | Are you checking correctness, output, cost, latency, and safety? |
143+
144+
**The loop:** weird prod session → turn it into a regression test → it shows up in your coverage.
145+
146+
```bash
147+
# Compact summary for screenshots / sharing
148+
evalview run --summary
149+
```
150+
151+
```
152+
━━━ EvalView Summary ━━━
153+
Suite: analytics_agent
154+
Tests: 7 passed, 2 failed
155+
156+
Failures:
157+
✗ cohort: large result set cost +240%
158+
✗ doc QA: long context missing tool: chunking
159+
160+
Deltas vs last run:
161+
Tokens: +188% ↑
162+
Latency: +95ms ↑
163+
Cost: +$0.12 ↑
164+
165+
⚠️ Regressions detected
166+
```
167+
168+
```bash
169+
# Behavior coverage report
170+
evalview run --coverage
171+
```
172+
173+
```
174+
━━━ Behavior Coverage ━━━
175+
Suite: analytics_agent
176+
177+
Tasks: 9/9 scenarios (100%)
178+
Tools: 6/8 exercised (75%)
179+
missing: chunking, summarize
180+
Paths: 3/3 multi-step workflows (100%)
181+
Dimensions: correctness ✓, output ✓, cost ✗, latency ✓, safety ✓
182+
183+
Overall: 92% behavior coverage
184+
```
185+
186+
---
187+
133188
## What it does (in practice)
134189

135190
- **Write test cases in YAML** – Define inputs, required tools, and scoring thresholds
@@ -294,6 +349,8 @@ Options:
294349
--max-retries N Retry flaky tests N times (default: 0)
295350
--watch Re-run tests on file changes
296351
--html-report PATH Generate interactive HTML report
352+
--summary Compact, screenshot-friendly output
353+
--coverage Show behavior coverage report
297354
```
298355
299356
### `evalview expand`

evalview/cli.py

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1144,6 +1144,16 @@ async def _init_wizard_async(dir: str):
11441144
type=click.Path(),
11451145
help="Generate HTML report to specified path",
11461146
)
1147+
@click.option(
1148+
"--summary",
1149+
is_flag=True,
1150+
help="Compact, screenshot-friendly output (great for sharing)",
1151+
)
1152+
@click.option(
1153+
"--coverage",
1154+
is_flag=True,
1155+
help="Show behavior coverage report (tasks, tools, paths, eval dimensions)",
1156+
)
11471157
def run(
11481158
path: Optional[str],
11491159
pattern: str,
@@ -1160,6 +1170,8 @@ def run(
11601170
retry_delay: float,
11611171
watch: bool,
11621172
html_report: str,
1173+
summary: bool,
1174+
coverage: bool,
11631175
):
11641176
"""Run test cases against the agent.
11651177
@@ -1168,7 +1180,7 @@ def run(
11681180
"""
11691181
asyncio.run(_run_async(
11701182
path, pattern, test, filter, output, verbose, track, compare_baseline, debug,
1171-
sequential, max_workers, max_retries, retry_delay, watch, html_report
1183+
sequential, max_workers, max_retries, retry_delay, watch, html_report, summary, coverage
11721184
))
11731185

11741186

@@ -1188,6 +1200,8 @@ async def _run_async(
11881200
retry_delay: float = 1.0,
11891201
watch: bool = False,
11901202
html_report: str = None,
1203+
summary: bool = False,
1204+
coverage: bool = False,
11911205
):
11921206
"""Async implementation of run command."""
11931207
import fnmatch
@@ -1901,7 +1915,29 @@ async def update_display():
19011915
# Print summary
19021916
console.print()
19031917
reporter = ConsoleReporter()
1904-
reporter.print_summary(results)
1918+
if summary:
1919+
# Compact, screenshot-friendly output
1920+
# Get suite name from path
1921+
suite_name = None
1922+
if path:
1923+
suite_name = Path(path).name if Path(path).is_dir() else Path(path).stem
1924+
1925+
# Load previous results for delta comparison
1926+
previous_results = None
1927+
output_dir = Path(output)
1928+
if output_dir.exists():
1929+
previous_results = JSONReporter.get_latest_results(output_dir)
1930+
1931+
reporter.print_compact_summary(results, suite_name=suite_name, previous_results=previous_results)
1932+
else:
1933+
reporter.print_summary(results)
1934+
1935+
# Print behavior coverage report if enabled
1936+
if coverage:
1937+
suite_name = None
1938+
if path:
1939+
suite_name = Path(path).name if Path(path).is_dir() else Path(path).stem
1940+
reporter.print_coverage_report(test_cases, results, suite_name=suite_name)
19051941

19061942
# Print regression analysis if enabled
19071943
if compare_baseline and regression_reports:

0 commit comments

Comments
 (0)