2 changes: 2 additions & 0 deletions README.md
@@ -19,6 +19,7 @@ RLCard is a toolkit for Reinforcement Learning (RL) in card games. It supports m
* Dou Dizhu Demo: [Demo](https://douzero.org/)
* Resources: [Awesome-Game-AI](https://github.com/datamllab/awesome-game-ai)
* Related Project: [DouZero Project](https://github.com/kwai/DouZero)
* Zhihu: https://zhuanlan.zhihu.com/p/526723604

**Community:**
* **Slack**: Discuss in our [#rlcard-project](https://join.slack.com/t/rlcard/shared_invite/zt-rkvktsaq-xkMwz8BfKupCM6zGhO01xg) slack channel.
@@ -27,6 +28,7 @@ RLCard is a toolkit for Reinforcement Learning (RL) in card games. It supports m
* Group 2: 117349516

**News:**
* We have updated the tutorials in Jupyter Notebook to help you walk through RLCard! Please check [RLCard Tutorial](https://github.com/datamllab/rlcard-tutorial).
* All the algorithms now support [PettingZoo](https://github.com/PettingZoo-Team/PettingZoo). Please check [here](examples/pettingzoo). Thanks to [Yifei Cheng](https://github.com/ycheng517) for the contribution.
* Please follow [DouZero](https://github.com/kwai/DouZero), a strong Dou Dizhu AI and the [ICML 2021 paper](https://arxiv.org/abs/2106.06135). An online demo is available [here](https://douzero.org/). The algorithm is also integrated in RLCard. See [Training DMC on Dou Dizhu](docs/toy-examples.md#training-dmc-on-dou-dizhu).
* Our package is used in [PettingZoo](https://github.com/PettingZoo-Team/PettingZoo). Please check it out!
2 changes: 2 additions & 0 deletions README.zh-CN.md
@@ -19,6 +19,7 @@ RLCard is a toolkit for Reinforcement Learning (RL) in card games
* Dou Dizhu Demo: [Demo](https://douzero.org/)
* Resources: [Awesome-Game-AI](https://github.com/datamllab/awesome-game-ai)
* Related Project: [DouZero Project](https://github.com/kwai/DouZero)
* Zhihu: [https://zhuanlan.zhihu.com/p/526723604](https://zhuanlan.zhihu.com/p/526723604)

**Community:**
* **Slack**: Discuss in our [#rlcard-project](https://join.slack.com/t/rlcard/shared_invite/zt-rkvktsaq-xkMwz8BfKupCM6zGhO01xg) Slack channel.
@@ -27,6 +28,7 @@ RLCard is a toolkit for Reinforcement Learning (RL) in card games
* Group 2: 117349516

**News:**
* We have updated the Jupyter Notebook tutorials to help you quickly get to know RLCard! Please check the [RLCard Tutorial](https://github.com/datamllab/rlcard-tutorial).
* All the algorithms now support the [PettingZoo](https://github.com/PettingZoo-Team/PettingZoo) interface. Please click [here](examples/pettingzoo). Thanks to [Yifei Cheng](https://github.com/ycheng517) for the contribution.
* Please follow [DouZero](https://github.com/kwai/DouZero), a strong Dou Dizhu AI, and the [ICML 2021 paper](https://arxiv.org/abs/2106.06135). An online demo is available [here](https://douzero.org/). The algorithm is also integrated into RLCard; see [Training DMC on Dou Dizhu](docs/toy-examples.md#training-dmc-on-dou-dizhu).
* Our package is used in [PettingZoo](https://github.com/PettingZoo-Team/PettingZoo). Please check it out!
63 changes: 33 additions & 30 deletions examples/human/nolimit_holdem_human.py
@@ -6,46 +6,49 @@
from rlcard import models
from rlcard.agents import NolimitholdemHumanAgent as HumanAgent
from rlcard.utils import print_card

# Make environment
env = rlcard.make('no-limit-holdem')

human_agent = HumanAgent(env.num_actions)
human_agent2 = HumanAgent(env.num_actions)
human_agent3 = HumanAgent(env.num_actions)
human_agent4 = HumanAgent(env.num_actions)
# random_agent = RandomAgent(num_actions=env.num_actions)

env.set_agents([human_agent, human_agent2])
env.set_agents([human_agent, human_agent2, human_agent3, human_agent4])
Member:
Why did you make it 4 players instead of 2?



while (True):
print(">> Start a new game")

trajectories, payoffs = env.run(is_training=False)
# If the human does not take the final action, we need to
# print other players action
final_state = trajectories[0][-1]
action_record = final_state['action_record']
state = final_state['raw_obs']
_action_list = []
for i in range(1, len(action_record)+1):
if action_record[-i][0] == state['current_player']:
break
_action_list.insert(0, action_record[-i])
for pair in _action_list:
print('>> Player', pair[0], 'chooses', pair[1])

# Let's take a look at what the agent card is
print('=============== Cards all Players ===============')
for hands in env.get_perfect_information()['hand_cards']:
print_card(hands)

print('=============== Result ===============')
if payoffs[0] > 0:
print('You win {} chips!'.format(payoffs[0]))
elif payoffs[0] == 0:
print('It is a tie.')
else:
print('You lose {} chips!'.format(-payoffs[0]))
print('')
while(True):
trajectories, payoffs = env.run(is_training=False)
# If the human does not take the final action, we need to
# print other players action
final_state = trajectories[0][-1]
action_record = final_state['action_record']
state = final_state['raw_obs']
_action_list = []
for i in range(1, len(action_record)+1):
if action_record[-i][0] == state['current_player']:
break
_action_list.insert(0, action_record[-i])
for pair in _action_list:
print('>> Player', pair[0], 'chooses', pair[1])

# Let's take a look at what the agent card is
print('=============== Cards all Players ===============')
for hands in env.get_perfect_information()['hand_cards']:
print_card(hands)

print('=============== Result ===============')
print(payoffs)
# if payoffs[0] > 0:
# print('You win {} chips!'.format(payoffs[0]))
# elif payoffs[0] == 0:
# print('It is a tie.')
# else:
# print('You lose {} chips!'.format(-payoffs[0]))
# print('')
break

input("Press any key to continue...")
2 changes: 1 addition & 1 deletion examples/run_rl.py
@@ -125,7 +125,7 @@ def train(args):
parser.add_argument(
'--cuda',
type=str,
default='',
default='0',
Member:
Could you make '' the default, since some users may not have a GPU?
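
A minimal sketch (not the exact run_rl.py code) of why '' is the safer default, assuming the script exports the flag to CUDA_VISIBLE_DEVICES before creating a torch device: with '' every GPU is hidden and the run falls back to CPU, while '0' assumes a GPU exists.

import argparse
import os
import torch

parser = argparse.ArgumentParser()
parser.add_argument('--cuda', type=str, default='')  # '' keeps CPU-only machines working
args = parser.parse_args()

# Export the flag before any CUDA call: '' hides every GPU, so the device falls back to CPU.
os.environ['CUDA_VISIBLE_DEVICES'] = args.cuda
device = torch.device('cuda:0' if args.cuda != '' and torch.cuda.is_available() else 'cpu')
print(device)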

)
parser.add_argument(
'--seed',
13 changes: 9 additions & 4 deletions rlcard/agents/human_agents/nolimit_holdem_human_agent.py
@@ -1,5 +1,5 @@
from rlcard.utils.utils import print_card

from rlcard.games.nolimitholdem.round import Action

class HumanAgent(object):
''' A human agent for No Limit Holdem. It can be used to play against trained models
@@ -26,10 +26,16 @@ def step(state):
'''
_print_state(state['raw_obs'], state['action_record'])
action = int(input('>> You choose action (integer): '))
while action < 0 or action >= len(state['legal_actions']):
amt = 0
if state['raw_legal_actions'][action] == Action.RAISE:
amt = int(input('>> Choose your raise amount: '))
while action < 0 or action >= len(state['legal_actions']) or\
(state['raw_legal_actions'][action] == Action.RAISE and amt < state['last_raise'] * 2):
print('Action illegal...')
action = int(input('>> Re-choose action (integer): '))
return state['raw_legal_actions'][action]
if state['raw_legal_actions'][action] == Action.RAISE:
amt = int(input('>> Choose your raise amount: '))
return state['raw_legal_actions'][action], amt

def eval_step(self, state):
''' Predict the action given the curent state for evaluation. The same to step here.
@@ -70,4 +76,3 @@ def _print_state(state, action_record):
print('\n=========== Actions You Can Choose ===========')
print(', '.join([str(index) + ': ' + str(action) for index, action in enumerate(state['legal_actions'])]))
print('')
print(state)
2 changes: 2 additions & 0 deletions rlcard/envs/env.py
@@ -163,6 +163,8 @@ def run(self, is_training=False):
state = self.get_state(player_id)
trajectories[player_id].append(state)

Member:
I believe we don't want to print trajectories in this method, which will affect all the games.

Author:
Sure. I should set this to WIP! I will resolve the status once the feature is done.
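
A minimal sketch of the caller-side alternative, assuming the standard rlcard example setup; printing after env.run returns keeps Env.run itself silent for every other game.

import rlcard
from rlcard.agents import RandomAgent

env = rlcard.make('no-limit-holdem')
env.set_agents([RandomAgent(num_actions=env.num_actions) for _ in range(env.num_players)])

# Print the trajectory in the example script, not inside Env.run.
trajectories, payoffs = env.run(is_training=False)
print(trajectories[0])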

print(trajectories[0])

# Payoffs
payoffs = self.get_payoffs()

2 changes: 1 addition & 1 deletion rlcard/envs/leducholdem.py
@@ -61,7 +61,7 @@ def _extract_state(self, state):
if public_card:
obs[self.card2index[public_card]+3] = 1
obs[state['my_chips']+6] = 1
obs[state['all_chips'][1]+20] = 1
obs[sum(state['all_chips'])-state['my_chips']+21] = 1
extracted_state['obs'] = obs

extracted_state['raw_obs'] = state
5 changes: 3 additions & 2 deletions rlcard/envs/nolimitholdem.py
@@ -9,7 +9,7 @@
from rlcard.games.nolimitholdem.round import Action

DEFAULT_GAME_CONFIG = {
'game_num_players': 2,
'game_num_players': 4,
'chips_for_each': 100,
'dealer_id': None,
}
@@ -69,10 +69,10 @@ def _extract_state(self, state):
obs[52] = float(my_chips)
obs[53] = float(max(all_chips))
extracted_state['obs'] = obs

extracted_state['raw_obs'] = state
extracted_state['raw_legal_actions'] = [a for a in state['legal_actions']]
extracted_state['action_record'] = self.action_recorder
extracted_state['last_raise'] = state['last_raise']

return extracted_state

@@ -114,6 +114,7 @@ def get_perfect_information(self):
state['hand_cards'] = [[c.get_index() for c in self.game.players[i].hand] for i in range(self.num_players)]
state['current_player'] = self.game.game_pointer
state['legal_actions'] = self.game.get_legal_actions()
state['last_raise'] = self.game.round.last_raise
return state


7 changes: 4 additions & 3 deletions rlcard/games/nolimitholdem/game.py
@@ -113,7 +113,7 @@ def get_legal_actions(self):
"""
return self.round.get_nolimit_legal_actions(players=self.players)

def step(self, action):
def step(self, action_tp):
Member:
What is the motivation for using action_tp? What would the legal actions be with action_tp?

Author:
action_tp (or action_tuple) is [Action, amt: int]. The legal actions will only contain FOLD, CHECK_CALL, and RAISE. In the case of a raise, action_tp[1] indicates the amount to raise; in other cases, action_tp[1] is not used.
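
A small illustrative sketch of the tuple interface described above, using the Action names from this diff; only RAISE consumes the amount element.

from rlcard.games.nolimitholdem.round import Action

# Every action handed to Game.step is an (Action, amount) pair.
call_tp = (Action.CHECK_CALL, 0)   # the amount is ignored for CHECK_CALL and FOLD
raise_tp = (Action.RAISE, 20)      # raise by 20 chips

# Game.step unpacks the pair before validating the action, as in this PR.
action, amt = raise_tp
assert action in (Action.FOLD, Action.CHECK_CALL, Action.RAISE)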

"""
Get the next state

@@ -126,7 +126,7 @@ def step(self, action):
(dict): next player's state
(int): next player id
"""

action, amt = action_tp
if action not in self.get_legal_actions():
print(action, self.get_legal_actions())
print(self.get_state(self.game_pointer))
@@ -143,7 +143,7 @@ def step(self, action):
self.history.append((r, b, r_c, d, p, ps))

# Then we proceed to the next round
self.game_pointer = self.round.proceed_round(self.players, action)
self.game_pointer = self.round.proceed_round(self.players, action_tp)

players_in_bypass = [1 if player.status in (PlayerStatus.FOLDED, PlayerStatus.ALLIN) else 0 for player in self.players]
if self.num_players - sum(players_in_bypass) == 1:
@@ -206,6 +206,7 @@ def get_state(self, player_id):
state['current_player'] = self.game_pointer
state['pot'] = self.dealer.pot
state['stage'] = self.stage
state['last_raise'] = self.round.last_raise
return state

def step_back(self):
73 changes: 44 additions & 29 deletions rlcard/games/nolimitholdem/round.py
@@ -8,12 +8,13 @@
class Action(Enum):
FOLD = 0
CHECK_CALL = 1
#CALL = 2
RAISE = 2
# CALL = 2
# RAISE_3BB = 3
RAISE_HALF_POT = 2
RAISE_POT = 3
# RAISE_2POT = 5
ALL_IN = 4
# RAISE_HALF_POT = 2
# RAISE_POT = 3
# # RAISE_2POT = 5
# ALL_IN = 4
# SMALL_BLIND = 7
# BIG_BLIND = 8

@@ -45,6 +46,7 @@ def __init__(self, num_players, init_raise_amount, dealer, np_random):

# Raised amount for each player
self.raised = [0 for _ in range(self.num_players)]
self.last_raise = 0

def start_new_round(self, game_pointer, raised=None):
"""
@@ -58,12 +60,13 @@ def start_new_round(self, game_pointer, raised=None):
"""
self.game_pointer = game_pointer
self.not_raise_num = 0
self.last_raise = 0
if raised:
self.raised = raised
else:
self.raised = [0 for _ in range(self.num_players)]

def proceed_round(self, players, action):
def proceed_round(self, players, action_tp):
"""
Call functions from other classes to keep one round running

@@ -76,29 +79,38 @@
"""
player = players[self.game_pointer]

action = action_tp[0]
if action == Action.CHECK_CALL:
diff = max(self.raised) - self.raised[self.game_pointer]
self.raised[self.game_pointer] = max(self.raised)
player.bet(chips=diff)
self.not_raise_num += 1

elif action == Action.RAISE:
raise_amt = action_tp[1]
self.last_raise = raise_amt
self.raised[self.game_pointer] += raise_amt
player.bet(chips=raise_amt)
self.not_raise_num = 1

elif action == Action.ALL_IN:
all_in_quantity = player.remained_chips
self.raised[self.game_pointer] = all_in_quantity + self.raised[self.game_pointer]
player.bet(chips=all_in_quantity)

self.not_raise_num = 1
# elif action == Action.ALL_IN:
# all_in_quantity = player.remained_chips
# self.raised[self.game_pointer] = all_in_quantity + self.raised[self.game_pointer]
# player.bet(chips=all_in_quantity)

elif action == Action.RAISE_POT:
self.raised[self.game_pointer] += self.dealer.pot
player.bet(chips=self.dealer.pot)
self.not_raise_num = 1
# self.not_raise_num = 1

elif action == Action.RAISE_HALF_POT:
quantity = int(self.dealer.pot / 2)
self.raised[self.game_pointer] += quantity
player.bet(chips=quantity)
self.not_raise_num = 1
# elif action == Action.RAISE_POT:
# self.raised[self.game_pointer] += self.dealer.pot
# player.bet(chips=self.dealer.pot)
# self.not_raise_num = 1

# elif action == Action.RAISE_HALF_POT:
# quantity = int(self.dealer.pot / 2)
# self.raised[self.game_pointer] += quantity
# player.bet(chips=quantity)
# self.not_raise_num = 1

elif action == Action.FOLD:
player.status = PlayerStatus.FOLDED
@@ -142,22 +154,25 @@ def get_nolimit_legal_actions(self, players):
diff = max(self.raised) - self.raised[self.game_pointer]
# If the current player has no more chips after call, we cannot raise
if diff > 0 and diff >= player.remained_chips:
full_actions.remove(Action.RAISE_HALF_POT)
full_actions.remove(Action.RAISE_POT)
# full_actions.remove(Action.RAISE_HALF_POT)
# full_actions.remove(Action.RAISE_POT)
full_actions.remove(Action.ALL_IN)
full_actions.remove(Action.RAISE)
# Even if we can raise, we have to check remained chips
else:
if self.dealer.pot > player.remained_chips:
full_actions.remove(Action.RAISE_POT)
if player.remained_chips < self.last_raise:
full_actions.remove(Action.RAISE)
# if self.dealer.pot > player.remained_chips:
# full_actions.remove(Action.RAISE_POT)

if int(self.dealer.pot / 2) > player.remained_chips:
full_actions.remove(Action.RAISE_HALF_POT)
# if int(self.dealer.pot / 2) > player.remained_chips:
# full_actions.remove(Action.RAISE_HALF_POT)

# Can't raise if the total raise amount is leq than the max raise amount of this round
# If raise by pot, there is no such concern
if Action.RAISE_HALF_POT in full_actions and \
int(self.dealer.pot / 2) + self.raised[self.game_pointer] <= max(self.raised):
full_actions.remove(Action.RAISE_HALF_POT)
# if Action.RAISE_HALF_POT in full_actions and \
# int(self.dealer.pot / 2) + self.raised[self.game_pointer] <= max(self.raised):
# full_actions.remove(Action.RAISE_HALF_POT)

return full_actions

Binary file modified rlcard/models/pretrained/leduc_holdem_cfr/average_policy.pkl
Binary file not shown.
Binary file modified rlcard/models/pretrained/leduc_holdem_cfr/iteration.pkl
Binary file not shown.
Binary file modified rlcard/models/pretrained/leduc_holdem_cfr/policy.pkl
Binary file not shown.
Binary file modified rlcard/models/pretrained/leduc_holdem_cfr/regrets.pkl
Binary file not shown.