Commit 7ef63bd

feat(report): show model and token usage in html reports
1 parent f329bbe commit 7ef63bd

File tree

evalview/reporters/html_reporter.py
tests/test_forbidden_tools.py

2 files changed: +110 -0 lines changed


evalview/reporters/html_reporter.py

Lines changed: 87 additions & 0 deletions
@@ -1,6 +1,7 @@
 """HTML report generator with interactive Plotly charts."""
 
 import json
+from collections import Counter
 from datetime import datetime
 from pathlib import Path
 from typing import List, Dict, Any
@@ -76,13 +77,22 @@ def _compute_summary(self, results: List[EvaluationResult]) -> Dict[str, Any]:
                 "total_cost": 0,
                 "total_latency": 0,
                 "avg_latency": 0,
+                "total_tokens": 0,
+                "models": [],
+                "models_display": "Unknown",
             }
 
         passed = sum(1 for r in results if r.passed)
         failed = len(results) - passed
         scores = [r.score for r in results]
         costs = [r.trace.metrics.total_cost for r in results]
         latencies = [r.trace.metrics.total_latency for r in results]
+        total_tokens = sum(
+            r.trace.metrics.total_tokens.total_tokens
+            for r in results
+            if r.trace.metrics.total_tokens
+        )
+        models = self._collect_models(results)
 
         return {
             "total": len(results),
@@ -96,8 +106,37 @@ def _compute_summary(self, results: List[EvaluationResult]) -> Dict[str, Any]:
             "avg_cost": round(sum(costs) / len(costs), 4) if costs else 0,
             "total_latency": round(sum(latencies), 0),
             "avg_latency": round(sum(latencies) / len(latencies), 0) if latencies else 0,
+            "total_tokens": total_tokens,
+            "models": models,
+            "models_display": ", ".join(models) if models else "Unknown",
         }
 
+    def _collect_models(self, results: List[EvaluationResult]) -> List[str]:
+        """Collect model fingerprints used across results."""
+        counts: Counter[str] = Counter()
+        for result in results:
+            for model_name in self._extract_models(result):
+                counts[model_name] += 1
+        return [model for model, _ in counts.most_common()]
+
+    def _extract_models(self, result: EvaluationResult) -> List[str]:
+        """Extract best-effort model labels from a result."""
+        models: list[str] = []
+        trace = result.trace
+        if trace.model_id:
+            if trace.model_provider:
+                models.append(f"{trace.model_provider}/{trace.model_id}")
+            else:
+                models.append(trace.model_id)
+        if trace.trace_context:
+            for span in trace.trace_context.spans:
+                if span.llm and span.llm.model:
+                    provider = span.llm.provider or trace.model_provider
+                    label = f"{provider}/{span.llm.model}" if provider else span.llm.model
+                    models.append(label)
+        # Preserve order while removing duplicates.
+        return list(dict.fromkeys(models))
+
     def _generate_charts(self, results: List[EvaluationResult]) -> Dict[str, str]:
         """Generate Plotly charts as JSON strings."""
         if not results:
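
Side note on the two helpers above: `_extract_models` dedupes with `dict.fromkeys`, which keeps first-seen order, while `_collect_models` ranks labels by frequency via `Counter.most_common`. A standalone sketch of those two patterns (illustrative only, not part of the commit):

from collections import Counter

labels = ["anthropic/claude-sonnet-4-6", "openai/gpt-4o", "anthropic/claude-sonnet-4-6"]

# Order-preserving dedup, as in _extract_models:
print(list(dict.fromkeys(labels)))
# ['anthropic/claude-sonnet-4-6', 'openai/gpt-4o']

# Frequency-ranked labels, as in _collect_models:
print([model for model, _ in Counter(labels).most_common()])
# ['anthropic/claude-sonnet-4-6', 'openai/gpt-4o']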
@@ -291,6 +330,9 @@ def _render_template(
         # Convert results to serializable format
         results_data = []
         for r in results:
+            total_tokens = r.trace.metrics.total_tokens
+            tokens_total = total_tokens.total_tokens if total_tokens else 0
+            models = self._extract_models(r)
             results_data.append({
                 "test_case": r.test_case,
                 "passed": r.passed,
@@ -310,6 +352,11 @@ def _render_template(
                 "latency": round(r.trace.metrics.total_latency, 0),
                 "steps": len(r.trace.steps),
                 "adapter": r.adapter_name or "http",
+                "model": ", ".join(models) if models else "Unknown",
+                "tokens_total": tokens_total,
+                "tokens_input": total_tokens.input_tokens if total_tokens else 0,
+                "tokens_output": total_tokens.output_tokens if total_tokens else 0,
+                "tokens_cached": total_tokens.cached_tokens if total_tokens else 0,
                 # Forbidden tool violations (empty list = no violations or not configured)
                 "forbidden_violations": (
                     r.evaluations.forbidden_tools.violations
@@ -356,6 +403,9 @@ def _render_template(
         .stat-card { text-align: center; padding: 1.5rem; }
         .stat-value { font-size: 2.5rem; font-weight: 700; }
         .stat-label { color: #64748b; font-size: 0.875rem; text-transform: uppercase; }
+        .meta-card { padding: 1rem 1.25rem; }
+        .meta-key { color: #64748b; font-size: 0.75rem; text-transform: uppercase; margin-bottom: 0.25rem; }
+        .meta-value { font-weight: 600; color: #0f172a; }
         .pass { color: var(--pass-color); }
         .fail { color: var(--fail-color); }
         .badge-pass { background-color: var(--pass-color); }
@@ -531,6 +581,26 @@ def _render_template(
                 </div>
             </div>
         </div>
+        <div class="row mb-4">
+            <div class="col-md-6">
+                <div class="card meta-card">
+                    <div class="meta-key">Models Used</div>
+                    <div class="meta-value">{{ summary.models_display }}</div>
+                </div>
+            </div>
+            <div class="col-md-3">
+                <div class="card meta-card">
+                    <div class="meta-key">Total Tokens</div>
+                    <div class="meta-value">{{ "{:,}".format(summary.total_tokens) if summary.total_tokens else "0" }}</div>
+                </div>
+            </div>
+            <div class="col-md-3">
+                <div class="card meta-card">
+                    <div class="meta-key">Avg Latency</div>
+                    <div class="meta-value">{{ summary.avg_latency }}ms</div>
+                </div>
+            </div>
+        </div>
 
         {% if plotly_available and charts %}
         <!-- Charts -->
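
The summary cards get thousands separators by calling `str.format` from inside a template expression. Assuming the reporter renders with Jinja2, which the `{{ ... }}` / `{% ... %}` syntax suggests, the pattern behaves like this (hypothetical standalone render, not code from the commit):

from jinja2 import Template

# Mirrors the Total Tokens expression in the hunk above.
tmpl = Template('{{ "{:,}".format(total) if total else "0" }}')
print(tmpl.render(total=1234567))  # -> 1,234,567
print(tmpl.render(total=0))        # -> 0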
@@ -667,6 +737,21 @@ def _render_template(
                     <tr>
                         <td>Cost</td>
                         <td>${{ result.cost }}</td>
                     </tr>
+                    <tr>
+                        <td>Model</td>
+                        <td><strong>{{ result.model }}</strong></td>
+                    </tr>
+                    <tr>
+                        <td>Tokens</td>
+                        <td>
+                            <strong>{{ "{:,}".format(result.tokens_total) }}</strong>
+                            {% if result.tokens_total %}
+                            <div class="small text-muted">
+                                in {{ result.tokens_input }} / out {{ result.tokens_output }}{% if result.tokens_cached %} / cached {{ result.tokens_cached }}{% endif %}
+                            </div>
+                            {% endif %}
+                        </td>
+                    </tr>
                     <tr>
                         <td>Latency</td>
                         <td>{{ result.latency }}ms</td>
@@ -713,6 +798,8 @@ def _render_template(
                     {% set tool_spans = result.spans | selectattr("kind", "equalto", "tool") | list %}
                     <div>LLM calls <span>{{ llm_spans | length }}</span></div>
                     <div>Tool calls <span>{{ tool_spans | length }}</span></div>
+                    <div>Model <span>{{ result.model }}</span></div>
+                    <div>Tokens <span>{{ "{:,}".format(result.tokens_total) }}</span></div>
                     <div>Total cost <span>${{ result.cost }}</span></div>
                     <div>Total latency <span>{{ result.latency }}ms</span></div>
                 </div>

tests/test_forbidden_tools.py

Lines changed: 23 additions & 0 deletions
@@ -20,6 +20,7 @@
     StepTrace,
     StepMetrics,
     ExecutionMetrics,
+    TokenUsage,
     ForbiddenToolEvaluation,
     TraceContext,
     Span,
@@ -488,3 +489,25 @@ def test_long_prompt_is_truncated(self):
         llm_spans = [s for s in spans if s["kind"] == "llm"]
         # Truncation limit is 600 chars + " …"
         assert len(llm_spans[0]["llm"]["prompt"]) <= 605
+
+    def test_report_includes_model_and_token_totals(self, tmp_path):
+        from evalview.reporters.html_reporter import HTMLReporter
+
+        reporter = HTMLReporter()
+        result = self._make_result_with_trace_context()
+        result.trace.model_id = "claude-sonnet-4-6"
+        result.trace.model_provider = "anthropic"
+        result.trace.metrics.total_tokens = TokenUsage(
+            input_tokens=123,
+            output_tokens=45,
+            cached_tokens=6,
+        )
+
+        output_path = tmp_path / "report.html"
+        reporter.generate([result], str(output_path))
+        html = output_path.read_text(encoding="utf-8")
+
+        assert "anthropic/claude-sonnet-4-6" in html
+        assert "Total Tokens" in html
+        assert "174" in html
+        assert "in 123 / out 45 / cached 6" in html
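One detail worth flagging in the new test: the `"174"` assertion only holds if `TokenUsage.total_tokens` sums all three buckets, since 123 + 45 + 6 = 174. The `TokenUsage` model itself is not shown in this diff, so treat that as an inference from the fixture:

# Inferred from the assertions, not from TokenUsage's definition:
# the rendered total is assumed to be input + output + cached.
expected_total = 123 + 45 + 6
assert expected_total == 174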
