11"""HTML report generator with interactive Plotly charts."""
22
33import json
4+ from collections import Counter
45from datetime import datetime
56from pathlib import Path
67from typing import List , Dict , Any
@@ -76,13 +77,22 @@ def _compute_summary(self, results: List[EvaluationResult]) -> Dict[str, Any]:
7677 "total_cost" : 0 ,
7778 "total_latency" : 0 ,
7879 "avg_latency" : 0 ,
80+ "total_tokens" : 0 ,
81+ "models" : [],
82+ "models_display" : "Unknown" ,
7983 }
8084
8185 passed = sum (1 for r in results if r .passed )
8286 failed = len (results ) - passed
8387 scores = [r .score for r in results ]
8488 costs = [r .trace .metrics .total_cost for r in results ]
8589 latencies = [r .trace .metrics .total_latency for r in results ]
90+ total_tokens = sum (
91+ r .trace .metrics .total_tokens .total_tokens
92+ for r in results
93+ if r .trace .metrics .total_tokens
94+ )
95+ models = self ._collect_models (results )
8696
8797 return {
8898 "total" : len (results ),
@@ -96,8 +106,37 @@ def _compute_summary(self, results: List[EvaluationResult]) -> Dict[str, Any]:
96106 "avg_cost" : round (sum (costs ) / len (costs ), 4 ) if costs else 0 ,
97107 "total_latency" : round (sum (latencies ), 0 ),
98108 "avg_latency" : round (sum (latencies ) / len (latencies ), 0 ) if latencies else 0 ,
109+ "total_tokens" : total_tokens ,
110+ "models" : models ,
111+ "models_display" : ", " .join (models ) if models else "Unknown" ,
99112 }
100113
114+ def _collect_models (self , results : List [EvaluationResult ]) -> List [str ]:
115+ """Collect model fingerprints used across results."""
116+ counts : Counter [str ] = Counter ()
117+ for result in results :
118+ for model_name in self ._extract_models (result ):
119+ counts [model_name ] += 1
120+ return [model for model , _ in counts .most_common ()]
121+
122+ def _extract_models (self , result : EvaluationResult ) -> List [str ]:
123+ """Extract best-effort model labels from a result."""
124+ models : list [str ] = []
125+ trace = result .trace
126+ if trace .model_id :
127+ if trace .model_provider :
128+ models .append (f"{ trace .model_provider } /{ trace .model_id } " )
129+ else :
130+ models .append (trace .model_id )
131+ if trace .trace_context :
132+ for span in trace .trace_context .spans :
133+ if span .llm and span .llm .model :
134+ provider = span .llm .provider or trace .model_provider
135+ label = f"{ provider } /{ span .llm .model } " if provider else span .llm .model
136+ models .append (label )
137+ # Preserve order while removing duplicates.
138+ return list (dict .fromkeys (models ))
139+
101140 def _generate_charts (self , results : List [EvaluationResult ]) -> Dict [str , str ]:
102141 """Generate Plotly charts as JSON strings."""
103142 if not results :
@@ -291,6 +330,9 @@ def _render_template(
291330 # Convert results to serializable format
292331 results_data = []
293332 for r in results :
333+ total_tokens = r .trace .metrics .total_tokens
334+ tokens_total = total_tokens .total_tokens if total_tokens else 0
335+ models = self ._extract_models (r )
294336 results_data .append ({
295337 "test_case" : r .test_case ,
296338 "passed" : r .passed ,
@@ -310,6 +352,11 @@ def _render_template(
310352 "latency" : round (r .trace .metrics .total_latency , 0 ),
311353 "steps" : len (r .trace .steps ),
312354 "adapter" : r .adapter_name or "http" ,
355+ "model" : ", " .join (models ) if models else "Unknown" ,
356+ "tokens_total" : tokens_total ,
357+ "tokens_input" : total_tokens .input_tokens if total_tokens else 0 ,
358+ "tokens_output" : total_tokens .output_tokens if total_tokens else 0 ,
359+ "tokens_cached" : total_tokens .cached_tokens if total_tokens else 0 ,
313360 # Forbidden tool violations (empty list = no violations or not configured)
314361 "forbidden_violations" : (
315362 r .evaluations .forbidden_tools .violations
@@ -356,6 +403,9 @@ def _render_template(
356403 .stat-card { text-align: center; padding: 1.5rem; }
357404 .stat-value { font-size: 2.5rem; font-weight: 700; }
358405 .stat-label { color: #64748b; font-size: 0.875rem; text-transform: uppercase; }
406+ .meta-card { padding: 1rem 1.25rem; }
407+ .meta-key { color: #64748b; font-size: 0.75rem; text-transform: uppercase; margin-bottom: 0.25rem; }
408+ .meta-value { font-weight: 600; color: #0f172a; }
359409 .pass { color: var(--pass-color); }
360410 .fail { color: var(--fail-color); }
361411 .badge-pass { background-color: var(--pass-color); }
@@ -531,6 +581,26 @@ def _render_template(
531581 </div>
532582 </div>
533583 </div>
584+ <div class="row mb-4">
585+ <div class="col-md-6">
586+ <div class="card meta-card">
587+ <div class="meta-key">Models Used</div>
588+ <div class="meta-value">{{ summary.models_display }}</div>
589+ </div>
590+ </div>
591+ <div class="col-md-3">
592+ <div class="card meta-card">
593+ <div class="meta-key">Total Tokens</div>
594+ <div class="meta-value">{{ "{:,}".format(summary.total_tokens) if summary.total_tokens else "0" }}</div>
595+ </div>
596+ </div>
597+ <div class="col-md-3">
598+ <div class="card meta-card">
599+ <div class="meta-key">Avg Latency</div>
600+ <div class="meta-value">{{ summary.avg_latency }}ms</div>
601+ </div>
602+ </div>
603+ </div>
534604
535605 {% if plotly_available and charts %}
536606 <!-- Charts -->
@@ -667,6 +737,21 @@ def _render_template(
667737 <td>Cost</td>
668738 <td>${{ result.cost }}</td>
669739 </tr>
740+ <tr>
741+ <td>Model</td>
742+ <td><strong>{{ result.model }}</strong></td>
743+ </tr>
744+ <tr>
745+ <td>Tokens</td>
746+ <td>
747+ <strong>{{ "{:,}".format(result.tokens_total) }}</strong>
748+ {% if result.tokens_total %}
749+ <div class="small text-muted">
750+ in {{ result.tokens_input }} / out {{ result.tokens_output }}{% if result.tokens_cached %} / cached {{ result.tokens_cached }}{% endif %}
751+ </div>
752+ {% endif %}
753+ </td>
754+ </tr>
670755 <tr>
671756 <td>Latency</td>
672757 <td>{{ result.latency }}ms</td>
@@ -713,6 +798,8 @@ def _render_template(
713798 {% set tool_spans = result.spans | selectattr("kind", "equalto", "tool") | list %}
714799 <div>LLM calls <span>{{ llm_spans | length }}</span></div>
715800 <div>Tool calls <span>{{ tool_spans | length }}</span></div>
801+ <div>Model <span>{{ result.model }}</span></div>
802+ <div>Tokens <span>{{ "{:,}".format(result.tokens_total) }}</span></div>
716803 <div>Total cost <span>${{ result.cost }}</span></div>
717804 <div>Total latency <span>{{ result.latency }}ms</span></div>
718805 </div>
0 commit comments