readme changes, gitignore cleanup, and multi-turn labels in check/generate/init output

hidai25 · hidai25 · commit bdaa687dd7db · 2026-03-14T22:04:12.000+02:00
diff --git a/.gitignore b/.gitignore
@@ -45,9 +45,19 @@ env/
 .evalview/results/
 .evalview/reports/
 .evalview/config.yaml
+.evalview/golden/
+.evalview/history.jsonl
 tests/test-cases/*.yaml
 !tests/test-cases/example.yaml
 
+# Temp / scratch
+tmp/
+code-review.md
+tests.yaml
+
+# Registry tokens
+.mcpregistry_*
+
 # Testing
 .pytest_cache/
 .coverage
diff --git a/README.md b/README.md
@@ -5,7 +5,7 @@
   <img src="assets/logo.png" alt="EvalView" width="350">
   <br>
   <strong>Regression guardrails for agents.</strong><br>
-  Generate tests, snapshot behavior, and catch silent regressions in CI before they hit production.
+  Generate tests, snapshot tool use and multi-turn behavior, and catch silent regressions in CI before they hit production.
 </p>
 
 <p align="center">
@@ -36,6 +36,8 @@ Use EvalView when you need:
 - **golden baseline testing for agents**
 - **MCP server regression testing**
 
+Normal tests catch crashes. Tracing shows what happened after the fact. EvalView catches the harder class of failures: the agent still returns `200`, but it stops asking the clarification question, takes the wrong tool path on turn two, or silently changes output quality after a model or prompt update.
+
 <p align="center">
   <img src="assets/hero.jpg" alt="EvalView — multi-turn execution trace with sequence diagram" width="860">
   <br>
@@ -55,6 +57,26 @@ EvalView sends test queries to your agent's API and records everything: which to
       Score: 85 → 55  Output similarity: 35%
 ```
 
+### Multi-turn regressions are first-class
+
+Many real failures are not single-turn failures. They happen when an agent should clarify, remember context, or act on a follow-up.
+
+```yaml
+name: refund-needs-order-number
+turns:
+  - query: "I want a refund"
+    expected:
+      output:
+        contains: ["order number"]
+  - query: "Order 4812"
+    expected:
+      tools: ["lookup_order", "check_policy"]
+thresholds:
+  min_score: 70
+```
+
+If the agent stops asking for the order number, skips straight to the wrong action, or takes a different tool path on the follow-up turn, EvalView flags it.
+
 **Four scoring layers, each one optional:**
 
 | Layer | What it checks | Needs API key? | Cost |
@@ -75,24 +97,6 @@ The first two layers alone catch most regressions — fully offline, zero cost.
 
 **Your data stays local.** Nothing is sent to EvalView servers — all processing happens on your machine.
 
-### Multi-turn regressions are first-class
-
-EvalView does not stop at single prompt/output checks. It can catch regressions where an agent skips a clarification question, asks the wrong follow-up, or takes the wrong tool path on turn two.
-
-```yaml
-tests:
-  - name: refund_flow_requires_clarification
-    conversation:
-      - user: "I want a refund"
-        expected:
-          assistant_contains: ["order number"]
-      - user: "Order 4812"
-        expected:
-          tools_called: ["lookup_order", "check_policy"]
-```
-
-That matters because many real agent failures happen after the first turn, when the agent has to remember context, ask a clarifying question, or decide whether to act.
-
 ### The workflow
 
 ```bash
@@ -120,6 +124,8 @@ evalview snapshot
 evalview check
 ```
 
+That starter flow can cover single-turn checks, clarification turns, and multi-turn follow-ups against the same baseline.
+
 ### Start Here
 
 Choose the shortest path for your use case:
diff --git a/evalview/commands/check_cmd.py b/evalview/commands/check_cmd.py
@@ -221,6 +221,14 @@ def check(test_path: str, test: str, json_output: bool, fail_on: str, strict: bo
             console.print(f"[red]❌ No test found with name: {test}[/red]\n")
             sys.exit(1)
 
+    test_metadata = {
+        tc.name: {
+            "is_multi_turn": bool(getattr(tc, "is_multi_turn", False)),
+            "behavior_class": (tc.meta or {}).get("behavior_class"),
+        }
+        for tc in test_cases
+    }
+
     # Load config
     config = _load_config_if_exists()
 
@@ -322,6 +330,7 @@ def check(test_path: str, test: str, json_output: bool, fail_on: str, strict: bo
         golden_traces=golden_traces,
         results=results,
         ai_root_causes=ai_root_causes,
+        test_metadata=test_metadata,
     )
 
     if execution_failures > 0 and not json_output:
diff --git a/evalview/commands/check_display.py b/evalview/commands/check_display.py
@@ -171,6 +171,7 @@ def _display_check_results(
     golden_traces: Optional[Dict[str, "GoldenTrace"]] = None,
     results: Optional[List["EvaluationResult"]] = None,
     ai_root_causes: Optional[Dict[str, Any]] = None,
+    test_metadata: Optional[Dict[str, Dict[str, Any]]] = None,
 ) -> None:
     """Display check results in JSON or console format."""
     import json
@@ -325,6 +326,10 @@ def _display_check_results(
                         score_part = f"  [{score_color}]{sign}{diff.score_diff:.1f} pts[/{score_color}]"
 
                     console.print(f"{severity_icon}: {name}{score_part}")
+                    meta = (test_metadata or {}).get(name, {})
+                    if meta.get("is_multi_turn"):
+                        behavior_class = str(meta.get("behavior_class") or "multi_turn").replace("_", " ")
+                        console.print(f"    [dim]Multi-turn path:[/dim] {behavior_class}")
 
                     golden_for_test = _goldens.get(name)
                     result_for_test = result_by_name.get(name)
diff --git a/evalview/commands/generate_cmd.py b/evalview/commands/generate_cmd.py
@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 from pathlib import Path
+import yaml
 
 import click
 
@@ -52,7 +53,16 @@ def _print_generated_test_preview(output_dir: Path, max_files: int = 2) -> None:
     console.print()
     console.print("[bold]Generated Test Preview[/bold]")
     for path in yaml_files[:max_files]:
+        try:
+            data = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
+        except Exception:
+            data = {}
+        meta = data.get("meta") or {}
+        behavior = str(meta.get("behavior_class") or "unknown").replace("_", " ")
+        turns = data.get("turns") or []
+        turn_label = f"{len(turns)} turns" if turns else "single turn"
         console.print(f"[dim]{path}[/dim]")
+        console.print(f"[dim]Behavior: {behavior} | {turn_label}[/dim]")
         console.print(path.read_text(encoding="utf-8").rstrip())
         console.print()
     if len(yaml_files) > max_files:
diff --git a/evalview/commands/init_cmd.py b/evalview/commands/init_cmd.py
@@ -242,7 +242,16 @@ def _print_generated_test_preview(tests_dir: Path, max_files: int = 1) -> None:
     console.print()
     console.print("[bold]Generated Test Preview[/bold]")
     for path in yaml_files[:max_files]:
+        try:
+            data = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
+        except Exception:
+            data = {}
+        meta = data.get("meta") or {}
+        behavior = str(meta.get("behavior_class") or "unknown").replace("_", " ")
+        turns = data.get("turns") or []
+        turn_label = f"{len(turns)} turns" if turns else "single turn"
         console.print(f"[dim]{path}[/dim]")
+        console.print(f"[dim]Behavior: {behavior} | {turn_label}[/dim]")
         console.print(path.read_text(encoding="utf-8").rstrip())
         console.print()
     if len(yaml_files) > max_files:
@@ -792,6 +801,7 @@ def _init_standard(dir: str, interactive: bool) -> None:
             console.print(
                 f"[dim]   Coverage: tool paths={covered.get('tool_paths', 0)}, "
                 f"direct answers={covered.get('direct_answers', 0)}, "
+                f"clarifications={covered.get('clarifications', 0)}, "
                 f"multi-turn={covered.get('multi_turn', 0)}[/dim]"
             )
             if n == 1:
diff --git a/evalview/test_generation.py b/evalview/test_generation.py
@@ -772,7 +772,7 @@ def _generate_test_name(self, query: str, tools: Sequence[str], behavior_class:
         if normalized_query == _CAPABILITY_PROMPT.lower():
             base = "Capability Overview"
         elif normalized_query == _SAFE_FOLLOW_UP.lower():
-            base = "Clarification Follow Up"
+            base = "Clarification Completion" if behavior_class == "multi_turn" else "Clarification Follow Up"
         else:
             words = re.findall(r"\b\w+\b", query)
             key_words = [
diff --git a/tests/test_check_pipeline.py b/tests/test_check_pipeline.py
@@ -332,3 +332,44 @@ def test_warning_shown_when_key_missing(self, project, monkeypatch):
         monkeypatch.delenv("OPENAI_API_KEY", raising=False)
         from evalview.core.semantic_diff import SemanticDiff
         assert SemanticDiff.is_available() is False
+
+
+def test_display_check_results_labels_multi_turn_regressions(capsys):
+    """Console check output should explicitly label failed multi-turn paths."""
+    from evalview.commands.check_display import _display_check_results
+    from evalview.core.diff import DiffStatus
+
+    diff = MagicMock()
+    diff.overall_severity = DiffStatus.REGRESSION
+    diff.score_diff = -20.0
+    diff.tool_diffs = []
+    diff.output_diff = None
+
+    analysis = {
+        "all_passed": False,
+        "has_regressions": True,
+        "has_tools_changed": False,
+        "has_output_changed": False,
+        "execution_failures": 0,
+    }
+    state = MagicMock()
+    state.current_streak = 0
+    state.total_checks = 1
+
+    _display_check_results(
+        [("refund-needs-order-number", diff)],
+        analysis,
+        state,
+        False,
+        False,
+        test_metadata={
+            "refund-needs-order-number": {
+                "is_multi_turn": True,
+                "behavior_class": "multi_turn",
+            }
+        },
+    )
+
+    output = capsys.readouterr().out
+    assert "Multi-turn path:" in output
+    assert "multi turn" in output
diff --git a/tests/test_generate_cmd.py b/tests/test_generate_cmd.py
@@ -143,6 +143,7 @@ def test_generate_writes_clustered_draft_suite(monkeypatch, tmp_path):
     assert "generated.report.json" in result.output
     assert "HTML report" in result.output
     assert "Generated Test Preview" in result.output
+    assert "Behavior:" in result.output
     assert "name:" in result.output
 
     yaml_files = sorted(out_dir.glob("*.yaml"))
diff --git a/tests/test_init_cmd.py b/tests/test_init_cmd.py
@@ -44,6 +44,7 @@ def _fake_generate(endpoint, out_dir):
     assert "tests/test-cases/" not in result.output
     assert "Only 2 distinct behavior path was discovered" not in result.output
     assert "Generated Test Preview" in result.output
+    assert "Behavior:" in result.output
     state = (tmp_path / ".evalview" / "state.json").read_text(encoding="utf-8")
     assert "tests/generated-from-init" in state
 
@@ -106,3 +107,4 @@ def test_init_explains_single_draft_as_single_behavior_path(monkeypatch, tmp_pat
     assert result.exit_code == 0, result.output
     assert "Only 1 distinct behavior path was discovered during the lighter init flow" in result.output
     assert "one representative draft test" in result.output
+    assert "clarifications=0" in result.output