Skip to content

Commit bdaa687

Browse files
committed
readme changes and gitignore cleanup
1 parent dbc5c8d commit bdaa687

File tree

10 files changed

+114
-20
lines changed

10 files changed

+114
-20
lines changed

.gitignore

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,19 @@ env/
4545
.evalview/results/
4646
.evalview/reports/
4747
.evalview/config.yaml
48+
.evalview/golden/
49+
.evalview/history.jsonl
4850
tests/test-cases/*.yaml
4951
!tests/test-cases/example.yaml
5052

53+
# Temp / scratch
54+
tmp/
55+
code-review.md
56+
tests.yaml
57+
58+
# Registry tokens
59+
.mcpregistry_*
60+
5161
# Testing
5262
.pytest_cache/
5363
.coverage

README.md

Lines changed: 25 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
<img src="assets/logo.png" alt="EvalView" width="350">
66
<br>
77
<strong>Regression guardrails for agents.</strong><br>
8-
Generate tests, snapshot behavior, and catch silent regressions in CI before they hit production.
8+
Generate tests, snapshot tool use and multi-turn behavior, and catch silent regressions in CI before they hit production.
99
</p>
1010

1111
<p align="center">
@@ -36,6 +36,8 @@ Use EvalView when you need:
3636
- **golden baseline testing for agents**
3737
- **MCP server regression testing**
3838

39+
Normal tests catch crashes. Tracing shows what happened after the fact. EvalView catches the harder class of failures: the agent still returns HTTP `200`, but it stops asking the clarification question, takes the wrong tool path on turn two, or silently changes output quality after a model or prompt update.
40+
3941
<p align="center">
4042
<img src="assets/hero.jpg" alt="EvalView — multi-turn execution trace with sequence diagram" width="860">
4143
<br>
@@ -55,6 +57,26 @@ EvalView sends test queries to your agent's API and records everything: which to
5557
Score: 85 → 55 Output similarity: 35%
5658
```
5759

60+
### Multi-turn regressions are first-class
61+
62+
Many real failures are not single-turn failures. They happen when an agent should clarify, remember context, or act on a follow-up.
63+
64+
```yaml
65+
name: refund-needs-order-number
66+
turns:
67+
- query: "I want a refund"
68+
expected:
69+
output:
70+
contains: ["order number"]
71+
- query: "Order 4812"
72+
expected:
73+
tools: ["lookup_order", "check_policy"]
74+
thresholds:
75+
min_score: 70
76+
```
77+
78+
If the agent stops asking for the order number, skips straight to the wrong action, or takes a different tool path on the follow-up turn, EvalView flags it.
79+
5880
**Four scoring layers, each one optional:**
5981
6082
| Layer | What it checks | Needs API key? | Cost |
@@ -75,24 +97,6 @@ The first two layers alone catch most regressions — fully offline, zero cost.
7597

7698
**Your data stays local.** Nothing is sent to EvalView servers — all processing happens on your machine.
7799

78-
### Multi-turn regressions are first-class
79-
80-
EvalView does not stop at single prompt/output checks. It can catch regressions where an agent skips a clarification question, asks the wrong follow-up, or takes the wrong tool path on turn two.
81-
82-
```yaml
83-
tests:
84-
- name: refund_flow_requires_clarification
85-
conversation:
86-
- user: "I want a refund"
87-
expected:
88-
assistant_contains: ["order number"]
89-
- user: "Order 4812"
90-
expected:
91-
tools_called: ["lookup_order", "check_policy"]
92-
```
93-
94-
That matters because many real agent failures happen after the first turn, when the agent has to remember context, ask a clarifying question, or decide whether to act.
95-
96100
### The workflow
97101

98102
```bash
@@ -120,6 +124,8 @@ evalview snapshot
120124
evalview check
121125
```
122126

127+
The same starter flow covers single-turn checks, clarification turns, and multi-turn follow-ups, all checked against one baseline.
128+
123129
### Start Here
124130

125131
Choose the shortest path for your use case:

evalview/commands/check_cmd.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,14 @@ def check(test_path: str, test: str, json_output: bool, fail_on: str, strict: bo
221221
console.print(f"[red]❌ No test found with name: {test}[/red]\n")
222222
sys.exit(1)
223223

224+
test_metadata = {
225+
tc.name: {
226+
"is_multi_turn": bool(getattr(tc, "is_multi_turn", False)),
227+
"behavior_class": (tc.meta or {}).get("behavior_class"),
228+
}
229+
for tc in test_cases
230+
}
231+
224232
# Load config
225233
config = _load_config_if_exists()
226234

@@ -322,6 +330,7 @@ def check(test_path: str, test: str, json_output: bool, fail_on: str, strict: bo
322330
golden_traces=golden_traces,
323331
results=results,
324332
ai_root_causes=ai_root_causes,
333+
test_metadata=test_metadata,
325334
)
326335

327336
if execution_failures > 0 and not json_output:

evalview/commands/check_display.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,7 @@ def _display_check_results(
171171
golden_traces: Optional[Dict[str, "GoldenTrace"]] = None,
172172
results: Optional[List["EvaluationResult"]] = None,
173173
ai_root_causes: Optional[Dict[str, Any]] = None,
174+
test_metadata: Optional[Dict[str, Dict[str, Any]]] = None,
174175
) -> None:
175176
"""Display check results in JSON or console format."""
176177
import json
@@ -325,6 +326,10 @@ def _display_check_results(
325326
score_part = f" [{score_color}]{sign}{diff.score_diff:.1f} pts[/{score_color}]"
326327

327328
console.print(f"{severity_icon}: {name}{score_part}")
329+
meta = (test_metadata or {}).get(name, {})
330+
if meta.get("is_multi_turn"):
331+
behavior_class = str(meta.get("behavior_class") or "multi_turn").replace("_", " ")
332+
console.print(f" [dim]Multi-turn path:[/dim] {behavior_class}")
328333

329334
golden_for_test = _goldens.get(name)
330335
result_for_test = result_by_name.get(name)

evalview/commands/generate_cmd.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from __future__ import annotations
33

44
from pathlib import Path
5+
import yaml
56

67
import click
78

@@ -52,7 +53,16 @@ def _print_generated_test_preview(output_dir: Path, max_files: int = 2) -> None:
5253
console.print()
5354
console.print("[bold]Generated Test Preview[/bold]")
5455
for path in yaml_files[:max_files]:
56+
try:
57+
data = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
58+
except Exception:
59+
data = {}
60+
meta = data.get("meta") or {}
61+
behavior = str(meta.get("behavior_class") or "unknown").replace("_", " ")
62+
turns = data.get("turns") or []
63+
turn_label = f"{len(turns)} turns" if turns else "single turn"
5564
console.print(f"[dim]{path}[/dim]")
65+
console.print(f"[dim]Behavior: {behavior} | {turn_label}[/dim]")
5666
console.print(path.read_text(encoding="utf-8").rstrip())
5767
console.print()
5868
if len(yaml_files) > max_files:

evalview/commands/init_cmd.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,16 @@ def _print_generated_test_preview(tests_dir: Path, max_files: int = 1) -> None:
242242
console.print()
243243
console.print("[bold]Generated Test Preview[/bold]")
244244
for path in yaml_files[:max_files]:
245+
try:
246+
data = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
247+
except Exception:
248+
data = {}
249+
meta = data.get("meta") or {}
250+
behavior = str(meta.get("behavior_class") or "unknown").replace("_", " ")
251+
turns = data.get("turns") or []
252+
turn_label = f"{len(turns)} turns" if turns else "single turn"
245253
console.print(f"[dim]{path}[/dim]")
254+
console.print(f"[dim]Behavior: {behavior} | {turn_label}[/dim]")
246255
console.print(path.read_text(encoding="utf-8").rstrip())
247256
console.print()
248257
if len(yaml_files) > max_files:
@@ -792,6 +801,7 @@ def _init_standard(dir: str, interactive: bool) -> None:
792801
console.print(
793802
f"[dim] Coverage: tool paths={covered.get('tool_paths', 0)}, "
794803
f"direct answers={covered.get('direct_answers', 0)}, "
804+
f"clarifications={covered.get('clarifications', 0)}, "
795805
f"multi-turn={covered.get('multi_turn', 0)}[/dim]"
796806
)
797807
if n == 1:

evalview/test_generation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -772,7 +772,7 @@ def _generate_test_name(self, query: str, tools: Sequence[str], behavior_class:
772772
if normalized_query == _CAPABILITY_PROMPT.lower():
773773
base = "Capability Overview"
774774
elif normalized_query == _SAFE_FOLLOW_UP.lower():
775-
base = "Clarification Follow Up"
775+
base = "Clarification Completion" if behavior_class == "multi_turn" else "Clarification Follow Up"
776776
else:
777777
words = re.findall(r"\b\w+\b", query)
778778
key_words = [

tests/test_check_pipeline.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,3 +332,44 @@ def test_warning_shown_when_key_missing(self, project, monkeypatch):
332332
monkeypatch.delenv("OPENAI_API_KEY", raising=False)
333333
from evalview.core.semantic_diff import SemanticDiff
334334
assert SemanticDiff.is_available() is False
335+
336+
337+
def test_display_check_results_labels_multi_turn_regressions(capsys):
338+
"""Console check output should explicitly label failed multi-turn paths."""
339+
from evalview.commands.check_display import _display_check_results
340+
from evalview.core.diff import DiffStatus
341+
342+
diff = MagicMock()
343+
diff.overall_severity = DiffStatus.REGRESSION
344+
diff.score_diff = -20.0
345+
diff.tool_diffs = []
346+
diff.output_diff = None
347+
348+
analysis = {
349+
"all_passed": False,
350+
"has_regressions": True,
351+
"has_tools_changed": False,
352+
"has_output_changed": False,
353+
"execution_failures": 0,
354+
}
355+
state = MagicMock()
356+
state.current_streak = 0
357+
state.total_checks = 1
358+
359+
_display_check_results(
360+
[("refund-needs-order-number", diff)],
361+
analysis,
362+
state,
363+
False,
364+
False,
365+
test_metadata={
366+
"refund-needs-order-number": {
367+
"is_multi_turn": True,
368+
"behavior_class": "multi_turn",
369+
}
370+
},
371+
)
372+
373+
output = capsys.readouterr().out
374+
assert "Multi-turn path:" in output
375+
assert "multi turn" in output

tests/test_generate_cmd.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ def test_generate_writes_clustered_draft_suite(monkeypatch, tmp_path):
143143
assert "generated.report.json" in result.output
144144
assert "HTML report" in result.output
145145
assert "Generated Test Preview" in result.output
146+
assert "Behavior:" in result.output
146147
assert "name:" in result.output
147148

148149
yaml_files = sorted(out_dir.glob("*.yaml"))

tests/test_init_cmd.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ def _fake_generate(endpoint, out_dir):
4444
assert "tests/test-cases/" not in result.output
4545
assert "Only 2 distinct behavior path was discovered" not in result.output
4646
assert "Generated Test Preview" in result.output
47+
assert "Behavior:" in result.output
4748
state = (tmp_path / ".evalview" / "state.json").read_text(encoding="utf-8")
4849
assert "tests/generated-from-init" in state
4950

@@ -106,3 +107,4 @@ def test_init_explains_single_draft_as_single_behavior_path(monkeypatch, tmp_pat
106107
assert result.exit_code == 0, result.output
107108
assert "Only 1 distinct behavior path was discovered during the lighter init flow" in result.output
108109
assert "one representative draft test" in result.output
110+
assert "clarifications=0" in result.output

0 commit comments

Comments
 (0)