31
31
background : # f8f9fa ;
32
32
padding : 20px ;
33
33
border-radius : 6px ;
34
- margin-bottom : 30 px ;
34
+ margin-bottom : 20 px ;
35
35
border-left : 4px solid # 00684a ;
36
36
}
37
+ .header-info : nth-child (3 ) {
38
+ border-left-color : # 007bff ;
39
+ }
40
+ .header-info : nth-child (4 ) {
41
+ border-left-color : # 28a745 ;
42
+ }
37
43
.header-info h2 {
38
44
margin-top : 0 ;
45
+ margin-bottom : 15px ;
39
46
color : # 00684a ;
47
+ font-size : 1.2em ;
48
+ }
49
+ .header-info : nth-child (3 ) h2 {
50
+ color : # 007bff ;
51
+ }
52
+ .header-info : nth-child (4 ) h2 {
53
+ color : # 28a745 ;
40
54
}
41
55
.info-grid {
42
56
display : grid;
43
- grid-template-columns : repeat (auto-fit, minmax (250 px , 1fr ));
57
+ grid-template-columns : repeat (auto-fit, minmax (200 px , 1fr ));
44
58
gap : 15px ;
45
59
margin-top : 15px ;
46
60
}
158
172
.accuracy-perfect {
159
173
background-color : # d4edda ;
160
174
color : # 155724 ;
161
- padding : 4 px 8 px ;
162
- border-radius : 4 px ;
175
+ padding : 2 px 6 px ;
176
+ border-radius : 3 px ;
163
177
font-weight : bold;
164
178
}
165
179
.accuracy-good {
166
180
background-color : # fff3cd ;
167
181
color : # 856404 ;
168
- padding : 4 px 8 px ;
169
- border-radius : 4 px ;
182
+ padding : 2 px 6 px ;
183
+ border-radius : 3 px ;
170
184
font-weight : bold;
171
185
}
172
186
.accuracy-poor {
173
187
background-color : # f8d7da ;
174
188
color : # 721c24 ;
175
- padding : 4 px 8 px ;
176
- border-radius : 4 px ;
189
+ padding : 2 px 6 px ;
190
+ border-radius : 3 px ;
177
191
font-weight : bold;
178
192
}
179
193
.tool-call {
215
229
min-width : 80px ;
216
230
text-align : center;
217
231
}
232
+ .baseline-accuracy-cell {
233
+ width : 8% ;
234
+ min-width : 80px ;
235
+ text-align : center;
236
+ }
237
+ .accuracy-comparison {
238
+ background : # e9ecef ;
239
+ padding : 2px 6px ;
240
+ border-radius : 3px ;
241
+ font-weight : bold;
242
+ }
243
+ .accuracy-improved {
244
+ background : # d4edda ;
245
+ color : # 155724 ;
246
+ }
247
+ .accuracy-regressed {
248
+ background : # f8d7da ;
249
+ color : # 721c24 ;
250
+ }
251
+ .accuracy-same {
252
+ background : # e2e3e5 ;
253
+ color : # 495057 ;
254
+ }
218
255
.response-time-cell {
219
256
width : 10% ;
220
257
min-width : 100px ;
264
301
< div class ="container ">
265
302
< h1 > 📊 MongoDB MCP Server - Accuracy Test Summary</ h1 >
266
303
< div class ="header-info ">
267
- < h2 > Run Information & Summary </ h2 >
304
+ < h2 > 📊 Current Run Information </ h2 >
268
305
< div class ="info-grid ">
269
306
< div class ="info-item ">
270
307
< div class ="info-label "> Accuracy Run ID</ div >
271
308
< div class ="info-value "> {{accuracyRunId}}</ div >
272
309
</ div >
273
- < div class ="info-item ">
274
- < div class ="info-label "> Accuracy Run Status</ div >
275
- < div class ="info-value status-{{runStatus}} "> {{runStatusUpper}}</ div >
276
- </ div >
277
310
< div class ="info-item ">
278
311
< div class ="info-label "> Commit SHA</ div >
279
312
< div class ="info-value "> {{commitSHA}}</ div >
280
313
</ div >
281
314
< div class ="info-item ">
282
- < div class ="info-label "> Report Generated On</ div >
283
- < div class ="info-value "> {{reportGeneratedOn }}</ div >
315
+ < div class ="info-label "> Run Created On</ div >
316
+ < div class ="info-value "> {{createdOn }}</ div >
284
317
</ div >
285
318
< div class ="info-item ">
286
- < div class ="info-label "> Snapshots Captured On</ div >
287
- < div class ="info-value "> {{createdOn }}</ div >
319
+ < div class ="info-label "> Report Generated On</ div >
320
+ < div class ="info-value "> {{reportGeneratedOn }}</ div >
288
321
</ div >
322
+ </ div >
323
+ </ div >
324
+
325
+ < div class ="header-info ">
326
+ < h2 > 📈 Test Results Summary</ h2 >
327
+ < div class ="info-grid ">
289
328
< div class ="info-item ">
290
329
< div class ="info-label "> Total Prompts Evaluated</ div >
291
330
< div class ="info-value "> {{totalTests}}</ div >
@@ -298,6 +337,36 @@ <h2>Run Information & Summary</h2>
298
337
< div class ="info-label "> Evals with 0% Accuracy</ div >
299
338
< div class ="info-value "> {{testsWithZeroAccuracy}}</ div >
300
339
</ div >
340
+ < div class ="info-item ">
341
+ < div class ="info-label "> Average Accuracy</ div >
342
+ < div class ="info-value "> {{averageAccuracy}}</ div >
343
+ </ div >
344
+ </ div >
345
+ </ div >
346
+
347
+ < div class ="header-info ">
348
+ < h2 > 🔄 Baseline Comparison</ h2 >
349
+ < div class ="info-grid ">
350
+ < div class ="info-item ">
351
+ < div class ="info-label "> Baseline Accuracy Run ID</ div >
352
+ < div class ="info-value "> {{baselineAccuracyRunId}}</ div >
353
+ </ div >
354
+ < div class ="info-item ">
355
+ < div class ="info-label "> Baseline Commit SHA</ div >
356
+ < div class ="info-value "> {{baselineCommitSHA}}</ div >
357
+ </ div >
358
+ < div class ="info-item ">
359
+ < div class ="info-label "> Baseline Run Created On</ div >
360
+ < div class ="info-value "> {{baselineCreatedOn}}</ div >
361
+ </ div >
362
+ < div class ="info-item ">
363
+ < div class ="info-label "> Evals Improved vs Baseline</ div >
364
+ < div class ="info-value "> {{evalsImproved}}</ div >
365
+ </ div >
366
+ < div class ="info-item ">
367
+ < div class ="info-label "> Evals Regressed vs Baseline</ div >
368
+ < div class ="info-value "> {{evalsRegressed}}</ div >
369
+ </ div >
301
370
</ div >
302
371
</ div >
303
372
< table >
@@ -308,6 +377,7 @@ <h2>Run Information & Summary</h2>
308
377
< th > Expected Tool Calls</ th >
309
378
< th > LLM Tool Calls</ th >
310
379
< th > Accuracy</ th >
380
+ < th > Baseline Accuracy</ th >
311
381
< th > LLM Response Time (ms)</ th >
312
382
< th > Total Tokens Used</ th >
313
383
</ tr >
0 commit comments