diff --git a/README.md b/README.md
index b952019d3..630dc22f7 100644
--- a/README.md
+++ b/README.md
@@ -19,6 +19,7 @@ RLCard is a toolkit for Reinforcement Learning (RL) in card games. It supports m
 * Dou Dizhu Demo: [Demo](https://douzero.org/)
 * Resources: [Awesome-Game-AI](https://github.com/datamllab/awesome-game-ai)
 * Related Project: [DouZero Project](https://github.com/kwai/DouZero)
+* Zhihu: https://zhuanlan.zhihu.com/p/526723604
 
 **Community:**
 * **Slack**: Discuss in our [#rlcard-project](https://join.slack.com/t/rlcard/shared_invite/zt-rkvktsaq-xkMwz8BfKupCM6zGhO01xg) slack channel.
@@ -27,6 +28,7 @@ RLCard is a toolkit for Reinforcement Learning (RL) in card games. It supports m
 * Group 2: 117349516
 
 **News:**
+* We have updated the tutorials in Jupyter Notebook to help you walk through RLCard! Please check [RLCard Tutorial](https://github.com/datamllab/rlcard-tutorial).
 * All the algorithms can suppport [PettingZoo](https://github.com/PettingZoo-Team/PettingZoo) now. Please check [here](examples/pettingzoo). Thanks the contribtuion from [Yifei Cheng](https://github.com/ycheng517).
 * Please follow [DouZero](https://github.com/kwai/DouZero), a strong Dou Dizhu AI and the [ICML 2021 paper](https://arxiv.org/abs/2106.06135). An online demo is available [here](https://douzero.org/). The algorithm is also integrated in RLCard. See [Training DMC on Dou Dizhu](docs/toy-examples.md#training-dmc-on-dou-dizhu).
 * Our package is used in [PettingZoo](https://github.com/PettingZoo-Team/PettingZoo). Please check it out!
diff --git a/README.zh-CN.md b/README.zh-CN.md
index 14f572981..b21a4c28e 100644
--- a/README.zh-CN.md
+++ b/README.zh-CN.md
@@ -19,6 +19,7 @@ RLCard是一款卡牌游戏强化学习 (Reinforcement Learning, RL) 的工具
 * 斗地主演示:[Demo](https://douzero.org/)
 * 资源:[Awesome-Game-AI](https://github.com/datamllab/awesome-game-ai)
 * 相关项目:[DouZero项目](https://github.com/kwai/DouZero)
+* 知乎:[https://zhuanlan.zhihu.com/p/526723604](https://zhuanlan.zhihu.com/p/526723604)
 
 **社区:**
 * **Slack**: 在我们的[#rlcard-project](https://join.slack.com/t/rlcard/shared_invite/zt-rkvktsaq-xkMwz8BfKupCM6zGhO01xg) slack频道参与讨论.
@@ -27,6 +28,7 @@ RLCard是一款卡牌游戏强化学习 (Reinforcement Learning, RL) 的工具
 * 二群:117349516
 
 **新闻:**
+* 我们更新Jupyter Notebook的教程帮助您快速了解RLCard!请看 [RLCard 教程](https://github.com/datamllab/rlcard-tutorial).
 * 所有的算法都已支持[PettingZoo](https://github.com/PettingZoo-Team/PettingZoo)接口. 请点击[这里](examples/pettingzoo). 感谢[Yifei Cheng](https://github.com/ycheng517)的贡献。
 * 请关注[DouZero](https://github.com/kwai/DouZero), 一个强大的斗地主AI,以及[ICML 2021论文](https://arxiv.org/abs/2106.06135)。点击[此处](https://douzero.org/)进入在线演示。该算法同样集成到了RLCard中,详见[在斗地主中训练DMC](docs/toy-examples.md#training-dmc-on-dou-dizhu)。
 * 我们的项目被用在[PettingZoo](https://github.com/PettingZoo-Team/PettingZoo)中,去看看吧!
diff --git a/examples/human/nolimit_holdem_human.py b/examples/human/nolimit_holdem_human.py
index 76f29da11..0afd2c213 100644
--- a/examples/human/nolimit_holdem_human.py
+++ b/examples/human/nolimit_holdem_human.py
@@ -6,46 +6,49 @@
 from rlcard import models
 from rlcard.agents import NolimitholdemHumanAgent as HumanAgent
 from rlcard.utils import print_card
-
 # Make environment
 env = rlcard.make('no-limit-holdem')
 human_agent = HumanAgent(env.num_actions)
 human_agent2 = HumanAgent(env.num_actions)
+human_agent3 = HumanAgent(env.num_actions)
+human_agent4 = HumanAgent(env.num_actions)
 # random_agent = RandomAgent(num_actions=env.num_actions)
-env.set_agents([human_agent, human_agent2])
+env.set_agents([human_agent, human_agent2, human_agent3, human_agent4])
 
 while (True):
     print(">> Start a new game")
-
-    trajectories, payoffs = env.run(is_training=False)
-    # If the human does not take the final action, we need to
-    # print other players action
-    final_state = trajectories[0][-1]
-    action_record = final_state['action_record']
-    state = final_state['raw_obs']
-    _action_list = []
-    for i in range(1, len(action_record)+1):
-        if action_record[-i][0] == state['current_player']:
-            break
-        _action_list.insert(0, action_record[-i])
-    for pair in _action_list:
-        print('>> Player', pair[0], 'chooses', pair[1])
-
-    # Let's take a look at what the agent card is
-    print('=============== Cards all Players ===============')
-    for hands in env.get_perfect_information()['hand_cards']:
-        print_card(hands)
-
-    print('=============== Result ===============')
-    if payoffs[0] > 0:
-        print('You win {} chips!'.format(payoffs[0]))
-    elif payoffs[0] == 0:
-        print('It is a tie.')
-    else:
-        print('You lose {} chips!'.format(-payoffs[0]))
-    print('')
+    while(True):
+        trajectories, payoffs = env.run(is_training=False)
+        # If the human does not take the final action, we need to
+        # print other players action
+        final_state = trajectories[0][-1]
+        action_record = final_state['action_record']
+        state = final_state['raw_obs']
+        _action_list = []
+        for i in range(1, len(action_record)+1):
+            if action_record[-i][0] == state['current_player']:
+                break
+            _action_list.insert(0, action_record[-i])
+        for pair in _action_list:
+            print('>> Player', pair[0], 'chooses', pair[1])
+
+        # Let's take a look at what the agent card is
+        print('=============== Cards all Players ===============')
+        for hands in env.get_perfect_information()['hand_cards']:
+            print_card(hands)
+
+        print('=============== Result ===============')
+        print(payoffs)
+        # if payoffs[0] > 0:
+        #     print('You win {} chips!'.format(payoffs[0]))
+        # elif payoffs[0] == 0:
+        #     print('It is a tie.')
+        # else:
+        #     print('You lose {} chips!'.format(-payoffs[0]))
+        # print('')
+        break
 
     input("Press any key to continue...")
diff --git a/examples/run_rl.py b/examples/run_rl.py
index 6bef00d2a..f4ad6a03d 100644
--- a/examples/run_rl.py
+++ b/examples/run_rl.py
@@ -125,7 +125,7 @@ def train(args):
     parser.add_argument(
         '--cuda',
         type=str,
-        default='',
+        default='0',
     )
     parser.add_argument(
         '--seed',
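Note on the example change above: the script now prints the raw `payoffs` list instead of the old win/lose message, and `payoffs[i]` is simply the chip change for seat `i` after `env.run()`. A small helper that turns that list back into readable lines could look like the sketch below; `summarize_payoffs` is a hypothetical name, not part of this patch or of RLCard.

```python
# Sketch only: summarize the raw payoffs list printed by the patched example.
# payoffs[i] is the chip change of seat i after env.run(); summarize_payoffs
# is a hypothetical helper, not an RLCard function.
def summarize_payoffs(payoffs):
    lines = []
    for seat, chips in enumerate(payoffs):
        if chips > 0:
            lines.append('Player {} wins {} chips'.format(seat, chips))
        elif chips == 0:
            lines.append('Player {} breaks even'.format(seat))
        else:
            lines.append('Player {} loses {} chips'.format(seat, -chips))
    return '\n'.join(lines)

# Example: the kind of output one could print after env.run(is_training=False)
print(summarize_payoffs([3.5, -1.5, -1.0, -1.0]))
```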
diff --git a/rlcard/agents/human_agents/nolimit_holdem_human_agent.py b/rlcard/agents/human_agents/nolimit_holdem_human_agent.py
index 1d001b3f8..e0b982183 100644
--- a/rlcard/agents/human_agents/nolimit_holdem_human_agent.py
+++ b/rlcard/agents/human_agents/nolimit_holdem_human_agent.py
@@ -1,5 +1,5 @@
 from rlcard.utils.utils import print_card
-
+from rlcard.games.nolimitholdem.round import Action
 class HumanAgent(object):
     ''' A human agent for No Limit Holdem.
         It can be used to play against trained models
@@ -26,10 +26,16 @@ def step(state):
         '''
         _print_state(state['raw_obs'], state['action_record'])
         action = int(input('>> You choose action (integer): '))
-        while action < 0 or action >= len(state['legal_actions']):
+        amt = 0
+        if state['raw_legal_actions'][action] == Action.RAISE:
+            amt = int(input('>> Choose your raise amount: '))
+        while action < 0 or action >= len(state['legal_actions']) or\
+            (state['raw_legal_actions'][action] == Action.RAISE and amt < state['last_raise'] * 2):
             print('Action illegel...')
             action = int(input('>> Re-choose action (integer): '))
-        return state['raw_legal_actions'][action]
+            if state['raw_legal_actions'][action] == Action.RAISE:
+                amt = int(input('>> Choose your raise amount: '))
+        return state['raw_legal_actions'][action], amt
 
     def eval_step(self, state):
         ''' Predict the action given the curent state for evaluation. The same to step here.
@@ -70,4 +76,3 @@ def _print_state(state, action_record):
     print('\n=========== Actions You Can Choose ===========')
     print(', '.join([str(index) + ': ' + str(action) for index, action in enumerate(state['legal_actions'])]))
     print('')
-    print(state)
diff --git a/rlcard/envs/env.py b/rlcard/envs/env.py
index 93e239548..f736efc4b 100644
--- a/rlcard/envs/env.py
+++ b/rlcard/envs/env.py
@@ -163,6 +163,8 @@ def run(self, is_training=False):
             state = self.get_state(player_id)
             trajectories[player_id].append(state)
 
+        print(trajectories[0])
+
         # Payoffs
         payoffs = self.get_payoffs()
diff --git a/rlcard/envs/leducholdem.py b/rlcard/envs/leducholdem.py
index 280dee674..da389dd8e 100644
--- a/rlcard/envs/leducholdem.py
+++ b/rlcard/envs/leducholdem.py
@@ -61,7 +61,7 @@ def _extract_state(self, state):
         if public_card:
             obs[self.card2index[public_card]+3] = 1
         obs[state['my_chips']+6] = 1
-        obs[state['all_chips'][1]+20] = 1
+        obs[sum(state['all_chips'])-state['my_chips']+21] = 1
         extracted_state['obs'] = obs
 
         extracted_state['raw_obs'] = state
diff --git a/rlcard/envs/nolimitholdem.py b/rlcard/envs/nolimitholdem.py
index e7522cb44..811a941b5 100644
--- a/rlcard/envs/nolimitholdem.py
+++ b/rlcard/envs/nolimitholdem.py
@@ -9,7 +9,7 @@
 from rlcard.games.nolimitholdem.round import Action
 
 DEFAULT_GAME_CONFIG = {
-        'game_num_players': 2,
+        'game_num_players': 4,
         'chips_for_each': 100,
         'dealer_id': None,
         }
@@ -69,10 +69,10 @@ def _extract_state(self, state):
         obs[52] = float(my_chips)
         obs[53] = float(max(all_chips))
         extracted_state['obs'] = obs
-
         extracted_state['raw_obs'] = state
         extracted_state['raw_legal_actions'] = [a for a in state['legal_actions']]
         extracted_state['action_record'] = self.action_recorder
+        extracted_state['last_raise'] = state['last_raise']
 
         return extracted_state
@@ -114,6 +114,7 @@ def get_perfect_information(self):
         state['hand_cards'] = [[c.get_index() for c in self.game.players[i].hand] for i in range(self.num_players)]
         state['current_player'] = self.game.game_pointer
         state['legal_actions'] = self.game.get_legal_actions()
+        state['last_raise'] = self.game.round.last_raise
         return state
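With the changes above, the extracted state now carries `last_raise` and the human agent returns an `(action, amount)` pair instead of a bare `Action`. Any scripted agent plugged into the patched environment has to follow the same convention. A minimal sketch follows; `CheckCallAgent` is a made-up name for illustration, while `Action` and the `use_raw` convention come from RLCard itself.

```python
# Sketch only: a scripted agent compatible with the (action, amount) tuples
# introduced by this patch. CheckCallAgent is a hypothetical name, not an
# RLCard class; Action and the use_raw flag are from RLCard itself.
from rlcard.games.nolimitholdem.round import Action

class CheckCallAgent(object):
    ''' Always check/call and never raise, so the amount is always 0. '''
    use_raw = True  # hand raw actions straight to the game, like HumanAgent

    def __init__(self, num_actions):
        self.num_actions = num_actions

    def step(self, state):
        # CHECK_CALL is never removed by the legal-action filter and needs no amount
        return Action.CHECK_CALL, 0

    def eval_step(self, state):
        # Mirror HumanAgent: evaluation behaves the same as acting
        return self.step(state), {}
```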
diff --git a/rlcard/games/nolimitholdem/game.py b/rlcard/games/nolimitholdem/game.py
index 86276fbe0..0df12b2d8 100644
--- a/rlcard/games/nolimitholdem/game.py
+++ b/rlcard/games/nolimitholdem/game.py
@@ -113,7 +113,7 @@ def get_legal_actions(self):
         """
         return self.round.get_nolimit_legal_actions(players=self.players)
 
-    def step(self, action):
+    def step(self, action_tp):
         """
         Get the next state
 
@@ -126,7 +126,7 @@ def step(self, action):
             (dict): next player's state
             (int): next player id
         """
-
+        action, amt = action_tp
         if action not in self.get_legal_actions():
             print(action, self.get_legal_actions())
             print(self.get_state(self.game_pointer))
@@ -143,7 +143,7 @@ def step(self, action):
             self.history.append((r, b, r_c, d, p, ps))
 
         # Then we proceed to the next round
-        self.game_pointer = self.round.proceed_round(self.players, action)
+        self.game_pointer = self.round.proceed_round(self.players, action_tp)
 
         players_in_bypass = [1 if player.status in (PlayerStatus.FOLDED, PlayerStatus.ALLIN) else 0 for player in self.players]
         if self.num_players - sum(players_in_bypass) == 1:
@@ -206,6 +206,7 @@ def get_state(self, player_id):
         state['current_player'] = self.game_pointer
         state['pot'] = self.dealer.pot
         state['stage'] = self.stage
+        state['last_raise'] = self.round.last_raise
         return state
 
     def step_back(self):
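`NolimitholdemGame.step` now unpacks an `(action, amount)` tuple rather than a single `Action` and forwards the whole tuple to `proceed_round`, so driving the environment directly also takes tuples on the raw-action path. A rough sketch under this patch's interface (the raise size of 4 chips is an arbitrary example):

```python
# Sketch only: stepping the patched environment with (action, amount) tuples.
# Assumes the tuple-based game.step() from this diff; raw_action=True skips
# the id-to-action decoding so the tuple reaches the game unchanged.
import rlcard
from rlcard.games.nolimitholdem.round import Action

env = rlcard.make('no-limit-holdem')
state, player_id = env.reset()

# A call carries a dummy amount of 0; a raise carries the chips to put in.
state, player_id = env.step((Action.CHECK_CALL, 0), raw_action=True)
state, player_id = env.step((Action.RAISE, 4), raw_action=True)
print(state['raw_obs']['last_raise'])  # expected: 4 via the new last_raise field
```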
diff --git a/rlcard/games/nolimitholdem/round.py b/rlcard/games/nolimitholdem/round.py
index 01d4cf875..9b0ff7e83 100644
--- a/rlcard/games/nolimitholdem/round.py
+++ b/rlcard/games/nolimitholdem/round.py
@@ -8,12 +8,13 @@
 
 class Action(Enum):
     FOLD = 0
     CHECK_CALL = 1
-    #CALL = 2
+    RAISE = 2
+    # CALL = 2
     # RAISE_3BB = 3
-    RAISE_HALF_POT = 2
-    RAISE_POT = 3
-    # RAISE_2POT = 5
-    ALL_IN = 4
+    # RAISE_HALF_POT = 2
+    # RAISE_POT = 3
+    # # RAISE_2POT = 5
+    # ALL_IN = 4
     # SMALL_BLIND = 7
     # BIG_BLIND = 8
@@ -45,6 +46,7 @@ def __init__(self, num_players, init_raise_amount, dealer, np_random):
 
         # Raised amount for each player
         self.raised = [0 for _ in range(self.num_players)]
+        self.last_raise = 0
 
     def start_new_round(self, game_pointer, raised=None):
         """
@@ -58,12 +60,13 @@ def start_new_round(self, game_pointer, raised=None):
         """
         self.game_pointer = game_pointer
         self.not_raise_num = 0
+        self.last_raise = 0
         if raised:
             self.raised = raised
         else:
             self.raised = [0 for _ in range(self.num_players)]
 
-    def proceed_round(self, players, action):
+    def proceed_round(self, players, action_tp):
         """
         Call functions from other classes to keep one round running
@@ -76,29 +79,38 @@ def proceed_round(self, players, action):
         """
         player = players[self.game_pointer]
 
+        action = action_tp[0]
         if action == Action.CHECK_CALL:
             diff = max(self.raised) - self.raised[self.game_pointer]
             self.raised[self.game_pointer] = max(self.raised)
             player.bet(chips=diff)
             self.not_raise_num += 1
+
+        elif action == Action.RAISE:
+            raise_amt = action_tp[1]
+            self.last_raise = raise_amt
+            self.raised[self.game_pointer] += raise_amt
+            player.bet(chips=raise_amt)
+            self.not_raise_num = 1
 
-        elif action == Action.ALL_IN:
-            all_in_quantity = player.remained_chips
-            self.raised[self.game_pointer] = all_in_quantity + self.raised[self.game_pointer]
-            player.bet(chips=all_in_quantity)
-            self.not_raise_num = 1
+        # elif action == Action.ALL_IN:
+        #     all_in_quantity = player.remained_chips
+        #     self.raised[self.game_pointer] = all_in_quantity + self.raised[self.game_pointer]
+        #     player.bet(chips=all_in_quantity)
 
-        elif action == Action.RAISE_POT:
-            self.raised[self.game_pointer] += self.dealer.pot
-            player.bet(chips=self.dealer.pot)
-            self.not_raise_num = 1
+        #     self.not_raise_num = 1
 
-        elif action == Action.RAISE_HALF_POT:
-            quantity = int(self.dealer.pot / 2)
-            self.raised[self.game_pointer] += quantity
-            player.bet(chips=quantity)
-            self.not_raise_num = 1
+        # elif action == Action.RAISE_POT:
+        #     self.raised[self.game_pointer] += self.dealer.pot
+        #     player.bet(chips=self.dealer.pot)
+        #     self.not_raise_num = 1
+
+        # elif action == Action.RAISE_HALF_POT:
+        #     quantity = int(self.dealer.pot / 2)
+        #     self.raised[self.game_pointer] += quantity
+        #     player.bet(chips=quantity)
+        #     self.not_raise_num = 1
 
         elif action == Action.FOLD:
             player.status = PlayerStatus.FOLDED
@@ -142,22 +154,25 @@ def get_nolimit_legal_actions(self, players):
         diff = max(self.raised) - self.raised[self.game_pointer]
         # If the current player has no more chips after call, we cannot raise
         if diff > 0 and diff >= player.remained_chips:
-            full_actions.remove(Action.RAISE_HALF_POT)
-            full_actions.remove(Action.RAISE_POT)
-            full_actions.remove(Action.ALL_IN)
+            # full_actions.remove(Action.RAISE_HALF_POT)
+            # full_actions.remove(Action.RAISE_POT)
+            # full_actions.remove(Action.ALL_IN)
+            full_actions.remove(Action.RAISE)
 
         # Even if we can raise, we have to check remained chips
         else:
-            if self.dealer.pot > player.remained_chips:
-                full_actions.remove(Action.RAISE_POT)
+            if player.remained_chips < self.last_raise:
+                full_actions.remove(Action.RAISE)
+            # if self.dealer.pot > player.remained_chips:
+            #     full_actions.remove(Action.RAISE_POT)
 
-            if int(self.dealer.pot / 2) > player.remained_chips:
-                full_actions.remove(Action.RAISE_HALF_POT)
+            # if int(self.dealer.pot / 2) > player.remained_chips:
+            #     full_actions.remove(Action.RAISE_HALF_POT)
 
         # Can't raise if the total raise amount is leq than the max raise amount of this round
         # If raise by pot, there is no such concern
-        if Action.RAISE_HALF_POT in full_actions and \
-            int(self.dealer.pot / 2) + self.raised[self.game_pointer] <= max(self.raised):
-            full_actions.remove(Action.RAISE_HALF_POT)
+        # if Action.RAISE_HALF_POT in full_actions and \
+        #     int(self.dealer.pot / 2) + self.raised[self.game_pointer] <= max(self.raised):
+        #     full_actions.remove(Action.RAISE_HALF_POT)
 
         return full_actions
diff --git a/rlcard/models/pretrained/leduc_holdem_cfr/average_policy.pkl b/rlcard/models/pretrained/leduc_holdem_cfr/average_policy.pkl
index 3cda26e01..71236521e 100644
Binary files a/rlcard/models/pretrained/leduc_holdem_cfr/average_policy.pkl and b/rlcard/models/pretrained/leduc_holdem_cfr/average_policy.pkl differ
diff --git a/rlcard/models/pretrained/leduc_holdem_cfr/iteration.pkl b/rlcard/models/pretrained/leduc_holdem_cfr/iteration.pkl
index bb130e631..36717717b 100644
Binary files a/rlcard/models/pretrained/leduc_holdem_cfr/iteration.pkl and b/rlcard/models/pretrained/leduc_holdem_cfr/iteration.pkl differ
diff --git a/rlcard/models/pretrained/leduc_holdem_cfr/policy.pkl b/rlcard/models/pretrained/leduc_holdem_cfr/policy.pkl
index 2e48cb34a..daedbc2c4 100644
Binary files a/rlcard/models/pretrained/leduc_holdem_cfr/policy.pkl and b/rlcard/models/pretrained/leduc_holdem_cfr/policy.pkl differ
diff --git a/rlcard/models/pretrained/leduc_holdem_cfr/regrets.pkl b/rlcard/models/pretrained/leduc_holdem_cfr/regrets.pkl
index 8341caaf4..8d4a6489e 100644
Binary files a/rlcard/models/pretrained/leduc_holdem_cfr/regrets.pkl and b/rlcard/models/pretrained/leduc_holdem_cfr/regrets.pkl differ
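Taken together, the patched legal-action filter (RAISE is withheld when the player cannot cover `last_raise`) and the human agent's prompt (a re-raise must be at least `2 * last_raise`) define when a raise amount is accepted. A compact restatement of that combined rule is sketched below; `can_raise` is a hypothetical helper for illustration only, not part of RLCard.

```python
# Sketch only: the raise rules this patch enforces, gathered in one place.
# can_raise is a hypothetical helper, not an RLCard function.
def can_raise(amt, last_raise, remained_chips):
    # Round filter: RAISE is only offered if the player can cover the last raise
    if remained_chips < last_raise:
        return False
    # Human-agent prompt: a re-raise must be at least twice the last raise
    return amt >= 2 * last_raise

assert can_raise(4, last_raise=2, remained_chips=100)       # minimum re-raise met
assert not can_raise(3, last_raise=2, remained_chips=100)   # below 2 * last_raise
assert not can_raise(10, last_raise=50, remained_chips=20)  # cannot cover last raise
```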