Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
eea0a37
add win rate, IRP, KSR, VSS, survival rate into key metrics
hannw Nov 16, 2025
c1d2e64
Add game theoretic eval into metrics
hannw Nov 17, 2025
6efbb25
change bootstrap solve to staticmethod
hannw Nov 17, 2025
e92ccfc
Add utilities to plot game theoretic eval
hannw Nov 18, 2025
c7e8dc6
add bar plot
hannw Nov 18, 2025
6e08e10
add elo and openskill evaluations
hannw Nov 18, 2025
77cd645
parallelize game loading
hannw Nov 18, 2025
b3a5e23
Report standard error instead of std dev
hannw Nov 19, 2025
32738bd
Plot the task marginal importance for GTE
hannw Nov 19, 2025
26d07c5
Add bootstrap for elo variance estimation
hannw Nov 19, 2025
2fc1070
Update the plots to use plotly
hannw Nov 21, 2025
4ea84a6
Add utilities to save multiple output paths and types for the figures
hannw Nov 21, 2025
8ee1722
parallelize bootstrap sampling
hannw Nov 21, 2025
c3f8ddf
Add bootstrap of openskill rating for its variance and parallelize op…
hannw Nov 21, 2025
2a36b32
Improve multiprocessing memory usage
hannw Nov 21, 2025
0f87160
Report 95% confidence interval instead of stderr
hannw Nov 21, 2025
3b60d5c
Use multithreading for bootstrap to save memory
hannw Nov 21, 2025
036723c
Use multiprocessing.Pool and initializer with global for bootstrap to…
hannw Nov 21, 2025
d48a51e
Use multiprocessing spawn to safely initialize multiprocessing
hannw Nov 21, 2025
431c619
Optimizing sample generation using iterator for bootstrap
hannw Nov 21, 2025
e069c9f
Sort elo and openskill plots
hannw Nov 21, 2025
9f680ab
Refactor GameResult to store only the raw data needed
hannw Nov 21, 2025
7d8c04b
Add dependencies for eval
hannw Nov 22, 2025
b0d567c
Fix plots to use consistent agent coloring
hannw Nov 22, 2025
6b4df9f
Use gte rating to sort default agent ordering in plots
hannw Nov 22, 2025
f75935b
Add win dependent metrics
hannw Nov 22, 2025
503dc77
Resolve jax deadlock issue
hannw Nov 22, 2025
fc277f4
Add pareto frontier plot
hannw Nov 22, 2025
bae2259
Add docstrings
hannw Nov 22, 2025
ffef1d6
Fix plotly google.colab compatibility issue
hannw Nov 22, 2025
c167d88
Add dominance metrics
hannw Nov 25, 2025
197e00e
Optimize the default list of tasks for game theoretic eval
hannw Nov 25, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions kaggle_environments/envs/werewolf/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -187,4 +187,28 @@ python kaggle_environments/envs/werewolf/scripts/self_play.py --litellm_model_pa
python kaggle_environments/envs/werewolf/scripts/dump_audio.py -o werewolf_replay_audio --debug-audio -r -s
# full llm game play and audio
python kaggle_environments/envs/werewolf/scripts/dump_audio.py --output_dir werewolf_replay_audio --shuffle_roles
```

## Running Evaluation
To run the evaluation scripts located in `kaggle_environments/envs/werewolf/eval/`, you'll need to install several additional dependencies. These are used for data manipulation, progress tracking, plotting, and advanced metrics calculation.

### Evaluation Dependencies
Install the following packages using pip:

```bash

pip install pandas tqdm plotly kaleido openskill.py polarix
```
On Linux, additionally run:
```bash
plotly_get_chrome
sudo apt update && sudo apt-get install libnss3 libatk-bridge2.0-0 libcups2 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libxkbcommon0 libpango-1.0-0 libcairo2 libasound2
```

### Example Usage
Once the dependencies are installed, you can run the metrics script on a directory of game replay JSONs:

```bash

python kaggle_environments/envs/werewolf/eval/metrics.py /path/to/your/replays
```
Empty file.
304 changes: 304 additions & 0 deletions kaggle_environments/envs/werewolf/eval/loaders.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,304 @@
import json
import os
from collections import namedtuple
from concurrent.futures import ProcessPoolExecutor
from typing import Dict, List, Optional

from kaggle_environments.envs.werewolf.game.consts import Team
from kaggle_environments.utils import structify


def _load_json(file_path):
    """Load a single JSON file, returning None when its content is unreadable.

    The file is always read as UTF-8 so the result does not depend on the
    platform's default text encoding. Files that are not valid UTF-8 or not
    valid JSON are reported with a warning and skipped, so one corrupt replay
    does not abort a whole batch load.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or None if the file could not be decoded.

    Raises:
        OSError: If the file cannot be opened (missing file, permissions, ...).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError):
        # Both are decode failures: malformed JSON text or bytes that are not UTF-8.
        print(f"Warning: Could not decode JSON from {file_path}")
        return None


def get_games(input_dir: str) -> List[dict]:
    """Collect every decodable JSON replay found beneath a directory.

    The directory tree is walked recursively and each ``.json`` file is
    loaded in a worker process.

    Args:
        input_dir: The root directory to search for .json replay files.

    Returns:
        A list of dictionaries, one per replay that was loaded successfully;
        files that failed to decode are left out.
    """
    replay_paths = [
        os.path.join(dirpath, filename)
        for dirpath, _, filenames in os.walk(input_dir)
        for filename in filenames
        if filename.endswith('.json')
    ]

    with ProcessPoolExecutor() as pool:
        loaded = list(pool.map(_load_json, replay_paths))

    # _load_json signals an unreadable file with None; drop those entries.
    return [replay for replay in loaded if replay is not None]


def _load_game_result(args):
    """Worker entry point: load one replay file and wrap it in a GameResult.

    Args:
        args: A ``(file_path, preserve_full_record)`` tuple, packed into a
            single argument so the function can be used with ``executor.map``.

    Returns:
        A GameResult for the replay, or None if the file could not be decoded.
    """
    replay_path, keep_full_record = args
    replay = _load_json(replay_path)
    return None if replay is None else GameResult(replay, preserve_full_record=keep_full_record)


def get_game_results(input_dir: str, preserve_full_record: bool = False,
                     max_workers: Optional[int] = None) -> List["GameResult"]:
    """Build GameResult objects for every replay beneath a directory, in parallel.

    Args:
        input_dir: The root directory to search for .json replay files.
        preserve_full_record: If True, each GameResult keeps the entire game
            JSON in memory (useful for debugging but consumes significant RAM).
        max_workers: The maximum number of worker processes to use; None lets
            the executor pick its default.

    Returns:
        A list of GameResult objects; replays that failed to decode are
        left out.
    """
    # Each work item carries the flag alongside the path because the worker
    # receives a single argument.
    work_items = [
        (os.path.join(dirpath, filename), preserve_full_record)
        for dirpath, _, filenames in os.walk(input_dir)
        for filename in filenames
        if filename.endswith('.json')
    ]

    with ProcessPoolExecutor(max_workers=max_workers) as pool:
        loaded = list(pool.map(_load_game_result, work_items))

    return [result for result in loaded if result is not None]



# Maps an upper-cased role name to the team that role belongs to.
# GameResult._get_players looks roles up with `role_name.upper()` and prints a
# warning (leaving the team as None) for any role missing from this table.
ROLE_TO_TEAM = {
"VILLAGER": Team.VILLAGERS,
"WEREWOLF": Team.WEREWOLVES,
"SEER": Team.VILLAGERS,
"DOCTOR": Team.VILLAGERS,
}

# Lightweight records built by GameResult._get_players for each participant.
# Agent: the agent behind a player; only its display name is kept.
Agent = namedtuple('Agent', ['display_name'])
# Role: the role name as found in the replay plus its team from ROLE_TO_TEAM.
Role = namedtuple('Role', ['name', 'team'])
# Player: player id, its Agent and Role, and whether the id is listed among
# the game-end survivors.
Player = namedtuple('Player', ['id', 'agent', 'role', 'alive'])


class GameResult:
    """A memory-efficient representation of a game's outcome.

    The raw replay dictionary is reduced to the pieces evaluation needs: the
    winning team, the players and their roles, per-player costs, and
    pre-computed voting statistics.

    Attributes:
        game_json: The structified full replay when ``preserve_full_record``
            is True, otherwise None.
        winner_team (Team): The team that won the game.
        players (List[Player]): One Player namedtuple per participant.
        villagers (set): IDs of the players on the Villager team.
        wolves (set): IDs of the players on the Werewolf team.
        id_to_agent (dict): Mapping from player ID to agent display name.
        player_costs (dict): Mapping of player ID to total USD cost.
        player_tokens (dict): Mapping of player ID to total tokens used
            (prompt + completion).
        irp_results (List[Tuple[str, int]]): Voting accuracy data for IRP metric.
        vss_results (List[Tuple[str, int]]): Voting accuracy data for VSS metric.
        player_durations (dict): Mapping of player ID to the day the player
            lasted until (the game's last day unless exiled/eliminated earlier).
    """

    def __init__(self, game_json: Dict, preserve_full_record: bool = False):
        """Initializes the GameResult.

        Args:
            game_json: The raw dictionary of the game replay.
            preserve_full_record: Whether to store the full `game_json` object.
        """
        if preserve_full_record:
            self.game_json = structify(game_json)
            game_end_info = self.game_json.info.GAME_END
            moderator_observation = self.game_json.info.MODERATOR_OBSERVATION
        else:
            self.game_json = None
            # Structify only the two sections that are read below, so the rest
            # of the replay is not kept alive by this object.
            info = game_json.get('info', {})
            game_end_info = structify(info.get('GAME_END', {}))
            moderator_observation = structify(info.get('MODERATOR_OBSERVATION', []))

        self.winner_team: Team = Team(game_end_info.winner_team)
        self.players = self._get_players(game_end_info)

        # Derived attributes for convenience
        self.villagers = {p.id for p in self.players if p.role.team == Team.VILLAGERS}
        self.wolves = {p.id for p in self.players if p.role.team == Team.WEREWOLVES}
        self.id_to_agent = {p.id: p.agent.display_name for p in self.players}

        self.player_costs, self.player_tokens = self._parse_cost_summary(game_end_info)

        # Pre-compute voting results so moderator_observation can be discarded.
        (self.irp_results,
         self.vss_results,
         self.player_durations) = self._precompute_voting_results(moderator_observation, game_end_info)

    def __repr__(self) -> str:
        player_lines = []
        for player in sorted(self.players, key=lambda p: p.agent.display_name):
            status = "W" if player.role.team == self.winner_team else "L"
            elim_info = "" if player.alive else " (eliminated)"
            cost_info = f", ${self.player_costs.get(player.id, 0.0):.4f}" if self.player_costs else ""
            player_lines.append(f" - {player.agent.display_name} ({player.role.name}, {status}){elim_info}{cost_info}")

        player_str = "\n".join(player_lines)
        return (
            f"<GameResult: {self.winner_team.value} won. {len(self.players)} players.\n"
            f"{player_str}\n>"
        )

    @staticmethod
    def _field(record, name, default=None):
        """Reads `name` from a record that may expose it as attribute or dict key.

        Args:
            record: An object or mapping (may be None).
            name: The field to read.
            default: Value returned when the field is missing or None.

        Returns:
            The field value, or `default` when it is absent or None. Falsy
            values such as 0 are returned as-is.
        """
        value = getattr(record, name, None)
        if value is None and isinstance(record, dict):
            value = record.get(name)
        return default if value is None else value

    @classmethod
    def _parse_cost_summary(cls, game_end_info):
        """Extracts per-player cost and token totals from the game-end summary.

        Reads `game_end_info.cost_summary.cost_per_agent`; each item is
        expected to carry an `agent_config` (holding the player `id`) and a
        `costs` record with `total_cost`, `prompt_tokens` and
        `completion_tokens`. Items without a player id or without costs are
        skipped, and missing/null numbers count as zero.

        Args:
            game_end_info: The game end summary object.

        Returns:
            A tuple (player_costs, player_tokens) of dicts keyed by player ID.
            Both are empty when no cost summary is available.
        """
        player_costs = {}
        player_tokens = {}

        cost_summary = cls._field(game_end_info, 'cost_summary')
        if not cost_summary:
            return player_costs, player_tokens

        for agent_summary in cls._field(cost_summary, 'cost_per_agent') or []:
            agent_config = cls._field(agent_summary, 'agent_config')
            costs = cls._field(agent_summary, 'costs')
            if not agent_config or not costs:
                continue

            player_id = cls._field(agent_config, 'id')
            if player_id is None:
                continue

            player_costs[player_id] = cls._field(costs, 'total_cost', 0.0)
            player_tokens[player_id] = (
                cls._field(costs, 'prompt_tokens', 0) + cls._field(costs, 'completion_tokens', 0)
            )
        return player_costs, player_tokens

    def _get_players(self, game_end_info) -> List["Player"]:
        """Builds the Player records from the game-end summary.

        A player counts as alive when its id is a key of
        `survivors_until_last_round_and_role`. Roles missing from
        ROLE_TO_TEAM produce a warning and a team of None.

        Args:
            game_end_info: The game end summary object.

        Returns:
            A list with one Player per entry of `game_end_info.all_players`.
        """
        out = []
        survivors = set(getattr(game_end_info, 'survivors_until_last_round_and_role', {}).keys())

        for p_info in getattr(game_end_info, 'all_players', []):
            role_name = p_info.agent.role
            team = ROLE_TO_TEAM.get(role_name.upper())
            if team is None:
                print(f"Warning: Unknown role '{role_name}' found in game data.")

            player = Player(
                id=p_info.id,
                agent=Agent(display_name=p_info.agent.display_name),
                role=Role(name=role_name, team=team),
                alive=p_info.id in survivors
            )
            out.append(player)
        return out

    def _precompute_voting_results(self, moderator_observation, game_end_info):
        """Extracts IRP, VSS scores and player durations from logs.

        This method processes the log once and stores the results, allowing the
        large observation object to be garbage collected.

        Args:
            moderator_observation: The raw event log from the moderator, as an
                iterable of steps, each an iterable of entries exposing
                `data_type` and `json_str`.
            game_end_info: The game end summary object.

        Returns:
            A tuple containing (irp_results, vss_results, player_durations).
        """
        day_vote_events = {}
        werewolf_exile_events = {}

        # Everyone is assumed to last until the final day unless a later
        # record says otherwise.
        last_day = getattr(game_end_info, 'last_day', 0)
        player_durations = {p.id: last_day for p in self.players}

        for step in moderator_observation:
            for entry in step:
                data_type = getattr(entry, 'data_type', None)
                json_str = getattr(entry, 'json_str', "{}")

                if data_type == "DayExileVoteDataEntry":
                    json_data = json.loads(json_str)
                    day_vote_events.setdefault(json_data["day"], []).append(json_data["data"])
                elif data_type == "DayExileElectedDataEntry":
                    json_data = json.loads(json_str)
                    exile_data = json_data["data"]
                    exiled_id = exile_data['elected_player_id']
                    # Only exiles of werewolves feed the VSS metric.
                    if exiled_id in self.wolves:
                        werewolf_exile_events[json_data["day"]] = exile_data
                    # An exiled player's duration ends on the day of the exile.
                    if exiled_id in player_durations:
                        player_durations[exiled_id] = json_data["day"]

        # elimination_info, when present as a mapping of player id -> record,
        # overrides the durations derived above.
        elimination_info = getattr(game_end_info, 'elimination_info', None)
        if isinstance(elimination_info, dict):
            for p_id, info in elimination_info.items():
                day = self._field(info, 'day')
                if day is not None and p_id in player_durations:
                    player_durations[p_id] = day

        # IRP: every day vote cast by a villager, scored 1 if it targets a wolf.
        irp_results = []
        for entries in day_vote_events.values():
            for entry in entries:
                actor_id = entry['actor_id']
                target_id = entry['target_id']
                if actor_id in self.villagers:
                    score = 1 if target_id in self.wolves else 0
                    irp_results.append((self.id_to_agent[actor_id], score))

        # VSS: villager votes on days a wolf was exiled, scored 1 if they
        # targeted that exiled wolf.
        vss_results = []
        for day, item in werewolf_exile_events.items():
            exiled_wolf_id = item['elected_player_id']
            for entry in day_vote_events.get(day, []):
                actor_id = entry['actor_id']
                target_id = entry['target_id']
                if actor_id in self.villagers:
                    score = 1 if target_id == exiled_wolf_id else 0
                    vss_results.append((self.id_to_agent[actor_id], score))
        return irp_results, vss_results, player_durations

    def iterate_voting_mini_game(self):
        """Returns the pre-computed voting results.

        Returns:
            tuple: A tuple containing:
                - irp_results (List[Tuple[str, int]]): (agent_name, score) for every day vote cast by a villager.
                  Score is 1 if they voted for a werewolf, 0 otherwise.
                - vss_results (List[Tuple[str, int]]): (agent_name, score) for villager votes on days a werewolf was exiled.
                  Score is 1 if they voted for the exiled werewolf, 0 otherwise.
        """
        return self.irp_results, self.vss_results
Loading