Skip to content

Commit 865dbfe

Browse files
chore: update report generation to include comparison with baseline as well
1 parent 6ccaa11 commit 865dbfe

File tree

4 files changed

+234 -31 lines changed

4 files changed

+234 -31 lines changed

.github/workflows/accuracy-tests.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ jobs:
2323
MDB_ACCURACY_MDB_URL: ${{ secrets.MDB_ACCURACY_MDB_URL }}
2424
MDB_ACCURACY_MDB_DB: ${{ secrets.MDB_ACCURACY_MDB_DB }}
2525
MDB_ACCURACY_MDB_COLLECTION: ${{ secrets.MDB_ACCURACY_MDB_COLLECTION }}
26+
MDB_ACCURACY_BASELINE_COMMIT: ${{ github.event.pull_request.base.sha || '' }}
2627
steps:
2728
- uses: GitHubSecurityLab/actions-permissions/monitor@v1
2829
- uses: actions/checkout@v4

resources/test-summary-template.html

Lines changed: 87 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -31,16 +31,30 @@
3131
background: #f8f9fa;
3232
padding: 20px;
3333
border-radius: 6px;
34-
margin-bottom: 30px;
34+
margin-bottom: 20px;
3535
border-left: 4px solid #00684a;
3636
}
37+
.header-info:nth-child(3) {
38+
border-left-color: #007bff;
39+
}
40+
.header-info:nth-child(4) {
41+
border-left-color: #28a745;
42+
}
3743
.header-info h2 {
3844
margin-top: 0;
45+
margin-bottom: 15px;
3946
color: #00684a;
47+
font-size: 1.2em;
48+
}
49+
.header-info:nth-child(3) h2 {
50+
color: #007bff;
51+
}
52+
.header-info:nth-child(4) h2 {
53+
color: #28a745;
4054
}
4155
.info-grid {
4256
display: grid;
43-
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
57+
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
4458
gap: 15px;
4559
margin-top: 15px;
4660
}
@@ -158,22 +172,22 @@
158172
.accuracy-perfect {
159173
background-color: #d4edda;
160174
color: #155724;
161-
padding: 4px 8px;
162-
border-radius: 4px;
175+
padding: 2px 6px;
176+
border-radius: 3px;
163177
font-weight: bold;
164178
}
165179
.accuracy-good {
166180
background-color: #fff3cd;
167181
color: #856404;
168-
padding: 4px 8px;
169-
border-radius: 4px;
182+
padding: 2px 6px;
183+
border-radius: 3px;
170184
font-weight: bold;
171185
}
172186
.accuracy-poor {
173187
background-color: #f8d7da;
174188
color: #721c24;
175-
padding: 4px 8px;
176-
border-radius: 4px;
189+
padding: 2px 6px;
190+
border-radius: 3px;
177191
font-weight: bold;
178192
}
179193
.tool-call {
@@ -215,6 +229,29 @@
215229
min-width: 80px;
216230
text-align: center;
217231
}
232+
.baseline-accuracy-cell {
233+
width: 8%;
234+
min-width: 80px;
235+
text-align: center;
236+
}
237+
.accuracy-comparison {
238+
background: #e9ecef;
239+
padding: 2px 6px;
240+
border-radius: 3px;
241+
font-weight: bold;
242+
}
243+
.accuracy-improved {
244+
background: #d4edda;
245+
color: #155724;
246+
}
247+
.accuracy-regressed {
248+
background: #f8d7da;
249+
color: #721c24;
250+
}
251+
.accuracy-same {
252+
background: #e2e3e5;
253+
color: #495057;
254+
}
218255
.response-time-cell {
219256
width: 10%;
220257
min-width: 100px;
@@ -264,28 +301,30 @@
264301
<div class="container">
265302
<h1>📊 MongoDB MCP Server - Accuracy Test Summary</h1>
266303
<div class="header-info">
267-
<h2>Run Information & Summary</h2>
304+
<h2>📊 Current Run Information</h2>
268305
<div class="info-grid">
269306
<div class="info-item">
270307
<div class="info-label">Accuracy Run ID</div>
271308
<div class="info-value">{{accuracyRunId}}</div>
272309
</div>
273-
<div class="info-item">
274-
<div class="info-label">Accuracy Run Status</div>
275-
<div class="info-value status-{{runStatus}}">{{runStatusUpper}}</div>
276-
</div>
277310
<div class="info-item">
278311
<div class="info-label">Commit SHA</div>
279312
<div class="info-value">{{commitSHA}}</div>
280313
</div>
281314
<div class="info-item">
282-
<div class="info-label">Report Generated On</div>
283-
<div class="info-value">{{reportGeneratedOn}}</div>
315+
<div class="info-label">Run Created On</div>
316+
<div class="info-value">{{createdOn}}</div>
284317
</div>
285318
<div class="info-item">
286-
<div class="info-label">Snapshots Captured On</div>
287-
<div class="info-value">{{createdOn}}</div>
319+
<div class="info-label">Report Generated On</div>
320+
<div class="info-value">{{reportGeneratedOn}}</div>
288321
</div>
322+
</div>
323+
</div>
324+
325+
<div class="header-info">
326+
<h2>📈 Test Results Summary</h2>
327+
<div class="info-grid">
289328
<div class="info-item">
290329
<div class="info-label">Total Prompts Evaluated</div>
291330
<div class="info-value">{{totalTests}}</div>
@@ -298,6 +337,36 @@ <h2>Run Information & Summary</h2>
298337
<div class="info-label">Evals with 0% Accuracy</div>
299338
<div class="info-value">{{testsWithZeroAccuracy}}</div>
300339
</div>
340+
<div class="info-item">
341+
<div class="info-label">Average Accuracy</div>
342+
<div class="info-value">{{averageAccuracy}}</div>
343+
</div>
344+
</div>
345+
</div>
346+
347+
<div class="header-info">
348+
<h2>🔄 Baseline Comparison</h2>
349+
<div class="info-grid">
350+
<div class="info-item">
351+
<div class="info-label">Baseline Accuracy Run ID</div>
352+
<div class="info-value">{{baselineAccuracyRunId}}</div>
353+
</div>
354+
<div class="info-item">
355+
<div class="info-label">Baseline Commit SHA</div>
356+
<div class="info-value">{{baselineCommitSHA}}</div>
357+
</div>
358+
<div class="info-item">
359+
<div class="info-label">Baseline Run Created On</div>
360+
<div class="info-value">{{baselineCreatedOn}}</div>
361+
</div>
362+
<div class="info-item">
363+
<div class="info-label">Evals Improved vs Baseline</div>
364+
<div class="info-value">{{evalsImproved}}</div>
365+
</div>
366+
<div class="info-item">
367+
<div class="info-label">Evals Regressed vs Baseline</div>
368+
<div class="info-value">{{evalsRegressed}}</div>
369+
</div>
301370
</div>
302371
</div>
303372
<table>
@@ -308,6 +377,7 @@ <h2>Run Information & Summary</h2>
308377
<th>Expected Tool Calls</th>
309378
<th>LLM Tool Calls</th>
310379
<th>Accuracy</th>
380+
<th>Baseline Accuracy</th>
311381
<th>LLM Response Time (ms)</th>
312382
<th>Total Tokens Used</th>
313383
</tr>

0 commit comments

Comments (0)