|
33 | 33 | from kaggle_environments.envs.werewolf.game.consts import Team |
34 | 34 |
|
35 | 35 |
|
36 | | -def _mean_std(values: List[float]) -> Tuple[float, float]: |
37 | | - """Helper to calculate mean and standard deviation, handling empty lists.""" |
| 36 | +def _mean_sem(values: List[float]) -> Tuple[float, float]: |
| 37 | + """Helper to calculate mean and standard error of the mean, handling empty lists.""" |
38 | 38 | if not values: |
39 | 39 | return 0.0, 0.0 |
40 | | - return np.mean(values), np.std(values) |
| 40 | + if len(values) < 2: |
| 41 | + return float(np.mean(values)), 0.0 |
| 42 | + return float(np.mean(values)), float(np.std(values, ddof=1) / np.sqrt(len(values))) |
41 | 43 |
|
42 | 44 |
|
43 | 45 | def calculate_elo_change(p1_elo, p2_elo, result, k=32): |
@@ -328,22 +330,22 @@ def set_agent_name(self, name: str): |
328 | 330 | self.openskill_rating = self.openskill_model.rating(name=name) if self.openskill_model else None |
329 | 331 |
|
330 | 332 | def get_win_rate(self) -> Tuple[float, float]: |
331 | | - return _mean_std(self.wins) |
| 333 | + return _mean_sem(self.wins) |
332 | 334 |
|
333 | 335 | def get_win_rate_for_role(self, role: str) -> Tuple[float, float]: |
334 | | - return _mean_std(self.wins_by_role.get(role, [])) |
| 336 | + return _mean_sem(self.wins_by_role.get(role, [])) |
335 | 337 |
|
336 | 338 | def get_irp(self) -> Tuple[float, float]: |
337 | | - return _mean_std(self.irp_scores) |
| 339 | + return _mean_sem(self.irp_scores) |
338 | 340 |
|
339 | 341 | def get_vss(self) -> Tuple[float, float]: |
340 | | - return _mean_std(self.vss_scores) |
| 342 | + return _mean_sem(self.vss_scores) |
341 | 343 |
|
342 | 344 | def get_ksr(self) -> Tuple[float, float]: |
343 | | - return _mean_std(self.survival_scores) |
| 345 | + return _mean_sem(self.survival_scores) |
344 | 346 |
|
345 | 347 | def get_ksr_for_role(self, role: str) -> Tuple[float, float]: |
346 | | - return _mean_std(self.survival_by_role.get(role, [])) |
| 348 | + return _mean_sem(self.survival_by_role.get(role, [])) |
347 | 349 |
|
348 | 350 |
|
349 | 351 | class GameSetEvaluator: |
@@ -539,7 +541,10 @@ def _bootstrap_solve(rnd, games, agents, tasks): |
539 | 541 | scores = agent_scores[agent][task] |
540 | 542 | if scores: |
541 | 543 | mean_matrix[i, j] = np.mean(scores) |
542 | | - stddev_matrix[i, j] = np.std(scores) |
| 544 | + if len(scores) > 1: |
| 545 | + stddev_matrix[i, j] = np.std(scores, ddof=1) / np.sqrt(len(scores)) |
| 546 | + else: |
| 547 | + stddev_matrix[i, j] = 0.0 |
543 | 548 |
|
544 | 549 | for j in range(mean_matrix.shape[1]): # Iterate over tasks (columns) |
545 | 550 | # If all agents have the same score for a task, add noise to avoid ptp=0 error |
|
0 commit comments