Skip to content

Commit 6bd1d71

Browse files
committed
Report standard error instead of std dev
1. Report standard error instead of std dev. 2. Apply Bessel's correction to the std calculation. 3. Use standard error for GTE variance.
1 parent b6e63c5 commit 6bd1d71

File tree

1 file changed

+15
-10
lines changed

1 file changed

+15
-10
lines changed

kaggle_environments/envs/werewolf/eval/metrics.py

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,13 @@
3333
from kaggle_environments.envs.werewolf.game.consts import Team
3434

3535

36-
def _mean_std(values: List[float]) -> Tuple[float, float]:
37-
"""Helper to calculate mean and standard deviation, handling empty lists."""
36+
def _mean_sem(values: List[float]) -> Tuple[float, float]:
37+
"""Helper to calculate mean and standard error of the mean, handling empty lists."""
3838
if not values:
3939
return 0.0, 0.0
40-
return np.mean(values), np.std(values)
40+
if len(values) < 2:
41+
return float(np.mean(values)), 0.0
42+
return float(np.mean(values)), float(np.std(values, ddof=1) / np.sqrt(len(values)))
4143

4244

4345
def calculate_elo_change(p1_elo, p2_elo, result, k=32):
@@ -328,22 +330,22 @@ def set_agent_name(self, name: str):
328330
self.openskill_rating = self.openskill_model.rating(name=name) if self.openskill_model else None
329331

330332
def get_win_rate(self) -> Tuple[float, float]:
331-
return _mean_std(self.wins)
333+
return _mean_sem(self.wins)
332334

333335
def get_win_rate_for_role(self, role: str) -> Tuple[float, float]:
334-
return _mean_std(self.wins_by_role.get(role, []))
336+
return _mean_sem(self.wins_by_role.get(role, []))
335337

336338
def get_irp(self) -> Tuple[float, float]:
337-
return _mean_std(self.irp_scores)
339+
return _mean_sem(self.irp_scores)
338340

339341
def get_vss(self) -> Tuple[float, float]:
340-
return _mean_std(self.vss_scores)
342+
return _mean_sem(self.vss_scores)
341343

342344
def get_ksr(self) -> Tuple[float, float]:
343-
return _mean_std(self.survival_scores)
345+
return _mean_sem(self.survival_scores)
344346

345347
def get_ksr_for_role(self, role: str) -> Tuple[float, float]:
346-
return _mean_std(self.survival_by_role.get(role, []))
348+
return _mean_sem(self.survival_by_role.get(role, []))
347349

348350

349351
class GameSetEvaluator:
@@ -539,7 +541,10 @@ def _bootstrap_solve(rnd, games, agents, tasks):
539541
scores = agent_scores[agent][task]
540542
if scores:
541543
mean_matrix[i, j] = np.mean(scores)
542-
stddev_matrix[i, j] = np.std(scores)
544+
if len(scores) > 1:
545+
stddev_matrix[i, j] = np.std(scores, ddof=1) / np.sqrt(len(scores))
546+
else:
547+
stddev_matrix[i, j] = 0.0
543548

544549
for j in range(mean_matrix.shape[1]): # Iterate over tasks (columns)
545550
# If all agents have the same score for a task, add noise to avoid ptp=0 error

0 commit comments

Comments
 (0)