From 446279fb64fb099fe50b97b21eda913a2c3e4044 Mon Sep 17 00:00:00 2001
From: RuiCheung
Date: Fri, 12 May 2017 14:46:17 +0800
Subject: [PATCH 01/14] Add beam search
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tf_chatbot/configs/config.py             |   3 +-
 tf_chatbot/lib/basic/advanced_seq2seq.py | 699 ++++++++++++-----------
 tf_chatbot/lib/data_utils.py             |   8 +-
 tf_chatbot/lib/predict.py                |   7 +-
 tf_chatbot/lib/seq2seq_model.py          | 245 +++++---
 tf_chatbot/lib/seq2seq_model_utils.py    |  28 +-
 6 files changed, 558 insertions(+), 432 deletions(-)

diff --git a/tf_chatbot/configs/config.py b/tf_chatbot/configs/config.py
index 6e7f4a7..918ef9a 100644
--- a/tf_chatbot/configs/config.py
+++ b/tf_chatbot/configs/config.py
@@ -12,9 +12,10 @@
 tf.app.flags.DEFINE_float('max_gradient_norm', 5.0, 'Clip gradients to this norm')
 tf.app.flags.DEFINE_integer('batch_size', 128, 'Batch size to use during training')
-tf.app.flags.DEFINE_integer('vocab_size', 20000, 'Dialog vocabulary size')
+tf.app.flags.DEFINE_integer('vocab_size', 1000, 'Dialog vocabulary size')
 tf.app.flags.DEFINE_integer('size', 128, 'size of each model layer')
 tf.app.flags.DEFINE_integer('num_layers', 1, 'Numbers of layers in the model')
+tf.app.flags.DEFINE_integer('beam_search_size', 3, 'Size of beam search op')
 tf.app.flags.DEFINE_integer('max_train_data_size', 0, 'Limit on the size of training data (0: no limit)')
 tf.app.flags.DEFINE_integer('steps_per_checkpoint', 100, 'How many training steps to do per checkpoint')
diff --git a/tf_chatbot/lib/basic/advanced_seq2seq.py b/tf_chatbot/lib/basic/advanced_seq2seq.py
index 6925393..9812072 100644
--- a/tf_chatbot/lib/basic/advanced_seq2seq.py
+++ b/tf_chatbot/lib/basic/advanced_seq2seq.py
@@ -3,8 +3,8 @@
 from __future__ import print_function
 
 # We disable pylint because we need python3 compatibility.
-from six.moves import xrange # pylint: disable=redefined-builtin
-from six.moves import zip # pylint: disable=redefined-builtin
+# from six.moves import xrange # pylint: disable=redefined-builtin
+# from six.moves import zip # pylint: disable=redefined-builtin
 
 from tensorflow.contrib.rnn.python.ops import core_rnn
 from tensorflow.contrib.rnn.python.ops import core_rnn_cell
@@ -19,6 +19,7 @@
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.util import nest
 from tensorflow import multinomial, squeeze
+import tensorflow as tf
 
 # TODO(ebrevdo): Remove once _linear is fully deprecated.
linear = core_rnn_cell_impl._linear # pylint: disable=protected-access @@ -27,18 +28,18 @@ def _extract_sample_and_embed(embedding, output_projection=None, update_embedding=True): + def loop_function(prev, _): + if output_projection is not None: + prev = nn_ops.xw_plus_b(prev, output_projection[0], output_projection[1]) + # prev_symbol = math_ops.argmax(prev, 1) + prev_symbol = squeeze(multinomial(prev, 1), axis=1) + emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol) + if not update_embedding: + emb_prev = array_ops.stop_gradient(emb_prev) + return emb_prev - def loop_function(prev, _): - if output_projection is not None: - prev = nn_ops.xw_plus_b(prev, output_projection[0], output_projection[1]) - #prev_symbol = math_ops.argmax(prev, 1) - prev_symbol = squeeze(multinomial(prev, 1), axis=1) - emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol) - if not update_embedding: - emb_prev = array_ops.stop_gradient(emb_prev) - return emb_prev + return loop_function - return loop_function def embedding_attention_sampled_seq2seq(encoder_inputs, decoder_inputs, @@ -51,121 +52,124 @@ def embedding_attention_sampled_seq2seq(encoder_inputs, feed_previous=False, dtype=None, scope=None, - initial_state_attention=False): - """Embedding sequence-to-sequence model with attention. - - This model first embeds encoder_inputs by a newly created embedding (of shape - [num_encoder_symbols x input_size]). Then it runs an RNN to encode - embedded encoder_inputs into a state vector. It keeps the outputs of this - RNN at every step to use for attention later. Next, it embeds decoder_inputs - by another newly created embedding (of shape [num_decoder_symbols x - input_size]). Then it runs attention decoder, initialized with the last - encoder state, on embedded decoder_inputs and attending to encoder outputs. - - Warning: when output_projection is None, the size of the attention vectors - and variables will be made proportional to num_decoder_symbols, can be large. - - Args: - encoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. - decoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. - cell: core_rnn_cell.RNNCell defining the cell function and size. - num_encoder_symbols: Integer; number of symbols on the encoder side. - num_decoder_symbols: Integer; number of symbols on the decoder side. - embedding_size: Integer, the length of the embedding vector for each symbol. - num_heads: Number of attention heads that read from attention_states. - output_projection: None or a pair (W, B) of output projection weights and - biases; W has shape [output_size x num_decoder_symbols] and B has - shape [num_decoder_symbols]; if provided and feed_previous=True, each - fed previous output will first be multiplied by W and added B. - feed_previous: Boolean or scalar Boolean Tensor; if True, only the first - of decoder_inputs will be used (the "GO" symbol), and all other decoder - inputs will be taken from previous outputs (as in embedding_rnn_decoder). - If False, decoder_inputs are used as given (the standard decoder case). - dtype: The dtype of the initial RNN state (default: tf.float32). - scope: VariableScope for the created subgraph; defaults to - "embedding_attention_seq2seq". - initial_state_attention: If False (default), initial attentions are zero. - If True, initialize the attentions from the initial state and attention - states. 
- - Returns: - A tuple of the form (outputs, state), where: - outputs: A list of the same length as decoder_inputs of 2D Tensors with - shape [batch_size x num_decoder_symbols] containing the generated - outputs. - state: The state of each decoder cell at the final time-step. - It is a 2D Tensor of shape [batch_size x cell.state_size]. - """ - with variable_scope.variable_scope( - scope or "embedding_attention_seq2seq", dtype=dtype) as scope: - dtype = scope.dtype - # Encoder. - encoder_cell = core_rnn_cell.EmbeddingWrapper( - cell, - embedding_classes=num_encoder_symbols, - embedding_size=embedding_size) - encoder_outputs, encoder_state = core_rnn.static_rnn( - encoder_cell, encoder_inputs, dtype=dtype) - - # First calculate a concatenation of encoder outputs to put attention on. - top_states = [ - array_ops.reshape(e, [-1, 1, cell.output_size]) for e in encoder_outputs - ] - attention_states = array_ops.concat(top_states, 1) - - # Decoder. - output_size = None - if output_projection is None: - cell = core_rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols) - output_size = num_decoder_symbols - - if isinstance(feed_previous, bool): - return embedding_attention_decoder( - decoder_inputs, - encoder_state, - attention_states, - cell, - num_decoder_symbols, - embedding_size, - num_heads=num_heads, - output_size=output_size, - output_projection=output_projection, - feed_previous=feed_previous, - initial_state_attention=initial_state_attention) - - # If feed_previous is a Tensor, we construct 2 graphs and use cond. - def decoder(feed_previous_bool): - reuse = None if feed_previous_bool else True - with variable_scope.variable_scope( - variable_scope.get_variable_scope(), reuse=reuse) as scope: - outputs, state = embedding_attention_decoder( - decoder_inputs, - encoder_state, - attention_states, + initial_state_attention=False, + batch_size=None): + """Embedding sequence-to-sequence model with attention. + + This model first embeds encoder_inputs by a newly created embedding (of shape + [num_encoder_symbols x input_size]). Then it runs an RNN to encode + embedded encoder_inputs into a state vector. It keeps the outputs of this + RNN at every step to use for attention later. Next, it embeds decoder_inputs + by another newly created embedding (of shape [num_decoder_symbols x + input_size]). Then it runs attention decoder, initialized with the last + encoder state, on embedded decoder_inputs and attending to encoder outputs. + + Warning: when output_projection is None, the size of the attention vectors + and variables will be made proportional to num_decoder_symbols, can be large. + + Args: + encoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. + decoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. + cell: core_rnn_cell.RNNCell defining the cell function and size. + num_encoder_symbols: Integer; number of symbols on the encoder side. + num_decoder_symbols: Integer; number of symbols on the decoder side. + embedding_size: Integer, the length of the embedding vector for each symbol. + num_heads: Number of attention heads that read from attention_states. + output_projection: None or a pair (W, B) of output projection weights and + biases; W has shape [output_size x num_decoder_symbols] and B has + shape [num_decoder_symbols]; if provided and feed_previous=True, each + fed previous output will first be multiplied by W and added B. 
+ feed_previous: Boolean or scalar Boolean Tensor; if True, only the first + of decoder_inputs will be used (the "GO" symbol), and all other decoder + inputs will be taken from previous outputs (as in embedding_rnn_decoder). + If False, decoder_inputs are used as given (the standard decoder case). + dtype: The dtype of the initial RNN state (default: tf.float32). + scope: VariableScope for the created subgraph; defaults to + "embedding_attention_seq2seq". + initial_state_attention: If False (default), initial attentions are zero. + If True, initialize the attentions from the initial state and attention + states. + + Returns: + A tuple of the form (outputs, state), where: + outputs: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x num_decoder_symbols] containing the generated + outputs. + state: The state of each decoder cell at the final time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. + """ + with variable_scope.variable_scope( + scope or "embedding_attention_seq2seq", dtype=dtype) as scope: + dtype = scope.dtype + # Encoder. + encoder_cell = core_rnn_cell.EmbeddingWrapper( cell, - num_decoder_symbols, - embedding_size, - num_heads=num_heads, - output_size=output_size, - output_projection=output_projection, - feed_previous=feed_previous_bool, - update_embedding_for_previous=False, - initial_state_attention=initial_state_attention) - state_list = [state] - if nest.is_sequence(state): - state_list = nest.flatten(state) - return outputs + state_list - - outputs_and_state = control_flow_ops.cond(feed_previous, - lambda: decoder(True), - lambda: decoder(False)) - outputs_len = len(decoder_inputs) # Outputs length same as decoder inputs. - state_list = outputs_and_state[outputs_len:] - state = state_list[0] - if nest.is_sequence(encoder_state): - state = nest.pack_sequence_as( - structure=encoder_state, flat_sequence=state_list) - return outputs_and_state[:outputs_len], state + embedding_classes=num_encoder_symbols, + embedding_size=embedding_size) + encoder_outputs, encoder_state = core_rnn.static_rnn( + encoder_cell, encoder_inputs, dtype=dtype) + + # First calculate a concatenation of encoder outputs to put attention on. + top_states = [ + array_ops.reshape(e, [-1, 1, cell.output_size]) for e in encoder_outputs + ] + attention_states = array_ops.concat(top_states, 1) + + # Decoder. + output_size = None + if output_projection is None: + cell = core_rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols) + output_size = num_decoder_symbols + + if isinstance(feed_previous, bool): + return embedding_attention_decoder( + decoder_inputs, + encoder_state, + attention_states, + cell, + num_decoder_symbols, + embedding_size, + num_heads=num_heads, + output_size=output_size, + output_projection=output_projection, + feed_previous=feed_previous, + initial_state_attention=initial_state_attention) + + else: + # If feed_previous is a Tensor, we construct 2 graphs and use cond. 
+ def decoder(feed_previous_bool): + reuse = None if feed_previous_bool else True + with variable_scope.variable_scope( + variable_scope.get_variable_scope(), reuse=reuse) as scope: + outputs, state = embedding_attention_decoder( + decoder_inputs, + encoder_state, + attention_states, + cell, + num_decoder_symbols, + embedding_size, + num_heads=num_heads, + output_size=output_size, + output_projection=output_projection, + feed_previous=feed_previous_bool, + update_embedding_for_previous=False, + initial_state_attention=initial_state_attention) + state_list = [state] + if nest.is_sequence(state): + state_list = nest.flatten(state) + return outputs + state_list + + outputs_and_state = control_flow_ops.cond(feed_previous, + lambda: decoder(True), + lambda: decoder(False)) + outputs_len = len(decoder_inputs) # Outputs length same as decoder inputs. + state_list = outputs_and_state[outputs_len:] + state = state_list[0] + if nest.is_sequence(encoder_state): + state = nest.pack_sequence_as( + structure=encoder_state, flat_sequence=state_list) + return outputs_and_state[:outputs_len], state + def embedding_attention_decoder(decoder_inputs, initial_state, @@ -181,76 +185,77 @@ def embedding_attention_decoder(decoder_inputs, dtype=None, scope=None, initial_state_attention=False): - """RNN decoder with embedding and attention and a pure-decoding option. - - Args: - decoder_inputs: A list of 1D batch-sized int32 Tensors (decoder inputs). - initial_state: 2D Tensor [batch_size x cell.state_size]. - attention_states: 3D Tensor [batch_size x attn_length x attn_size]. - cell: core_rnn_cell.RNNCell defining the cell function. - num_symbols: Integer, how many symbols come into the embedding. - embedding_size: Integer, the length of the embedding vector for each symbol. - num_heads: Number of attention heads that read from attention_states. - output_size: Size of the output vectors; if None, use output_size. - output_projection: None or a pair (W, B) of output projection weights and - biases; W has shape [output_size x num_symbols] and B has shape - [num_symbols]; if provided and feed_previous=True, each fed previous - output will first be multiplied by W and added B. - feed_previous: Boolean; if True, only the first of decoder_inputs will be - used (the "GO" symbol), and all other decoder inputs will be generated by: - next = embedding_lookup(embedding, argmax(previous_output)), - In effect, this implements a greedy decoder. It can also be used - during training to emulate http://arxiv.org/abs/1506.03099. - If False, decoder_inputs are used as given (the standard decoder case). - update_embedding_for_previous: Boolean; if False and feed_previous=True, - only the embedding for the first symbol of decoder_inputs (the "GO" - symbol) will be updated by back propagation. Embeddings for the symbols - generated from the decoder itself remain unchanged. This parameter has - no effect if feed_previous=False. - dtype: The dtype to use for the RNN initial states (default: tf.float32). - scope: VariableScope for the created subgraph; defaults to - "embedding_attention_decoder". - initial_state_attention: If False (default), initial attentions are zero. - If True, initialize the attentions from the initial state and attention - states -- useful when we wish to resume decoding from a previously - stored decoder state and attention states. 
- - Returns: - A tuple of the form (outputs, state), where: - outputs: A list of the same length as decoder_inputs of 2D Tensors with - shape [batch_size x output_size] containing the generated outputs. - state: The state of each decoder cell at the final time-step. - It is a 2D Tensor of shape [batch_size x cell.state_size]. - - Raises: - ValueError: When output_projection has the wrong shape. - """ - if output_size is None: - output_size = cell.output_size - if output_projection is not None: - proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype) - proj_biases.get_shape().assert_is_compatible_with([num_symbols]) - - with variable_scope.variable_scope( - scope or "embedding_attention_decoder", dtype=dtype) as scope: - - embedding = variable_scope.get_variable("embedding", - [num_symbols, embedding_size]) - loop_function = _extract_sample_and_embed( - embedding, output_projection, - update_embedding_for_previous) if feed_previous else None - emb_inp = [ - embedding_ops.embedding_lookup(embedding, i) for i in decoder_inputs - ] - return attention_decoder( - emb_inp, - initial_state, - attention_states, - cell, - output_size=output_size, - num_heads=num_heads, - loop_function=loop_function, - initial_state_attention=initial_state_attention) + """RNN decoder with embedding and attention and a pure-decoding option. + + Args: + decoder_inputs: A list of 1D batch-sized int32 Tensors (decoder inputs). + initial_state: 2D Tensor [batch_size x cell.state_size]. + attention_states: 3D Tensor [batch_size x attn_length x attn_size]. + cell: core_rnn_cell.RNNCell defining the cell function. + num_symbols: Integer, how many symbols come into the embedding. + embedding_size: Integer, the length of the embedding vector for each symbol. + num_heads: Number of attention heads that read from attention_states. + output_size: Size of the output vectors; if None, use output_size. + output_projection: None or a pair (W, B) of output projection weights and + biases; W has shape [output_size x num_symbols] and B has shape + [num_symbols]; if provided and feed_previous=True, each fed previous + output will first be multiplied by W and added B. + feed_previous: Boolean; if True, only the first of decoder_inputs will be + used (the "GO" symbol), and all other decoder inputs will be generated by: + next = embedding_lookup(embedding, argmax(previous_output)), + In effect, this implements a greedy decoder. It can also be used + during training to emulate http://arxiv.org/abs/1506.03099. + If False, decoder_inputs are used as given (the standard decoder case). + update_embedding_for_previous: Boolean; if False and feed_previous=True, + only the embedding for the first symbol of decoder_inputs (the "GO" + symbol) will be updated by back propagation. Embeddings for the symbols + generated from the decoder itself remain unchanged. This parameter has + no effect if feed_previous=False. + dtype: The dtype to use for the RNN initial states (default: tf.float32). + scope: VariableScope for the created subgraph; defaults to + "embedding_attention_decoder". + initial_state_attention: If False (default), initial attentions are zero. + If True, initialize the attentions from the initial state and attention + states -- useful when we wish to resume decoding from a previously + stored decoder state and attention states. 
+ + Returns: + A tuple of the form (outputs, state), where: + outputs: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x output_size] containing the generated outputs. + state: The state of each decoder cell at the final time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. + + Raises: + ValueError: When output_projection has the wrong shape. + """ + if output_size is None: + output_size = cell.output_size + if output_projection is not None: + proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype) + proj_biases.get_shape().assert_is_compatible_with([num_symbols]) + + with variable_scope.variable_scope( + scope or "embedding_attention_decoder", dtype=dtype) as scope: + + embedding = variable_scope.get_variable("embedding", + [num_symbols, embedding_size]) + loop_function = _extract_sample_and_embed( + embedding, output_projection, + update_embedding_for_previous) if feed_previous else None + emb_inp = [ + embedding_ops.embedding_lookup(embedding, i) for i in decoder_inputs + ] + return attention_decoder( + emb_inp, + initial_state, + attention_states, + cell, + output_size=output_size, + num_heads=num_heads, + loop_function=loop_function, + initial_state_attention=initial_state_attention) + def attention_decoder(decoder_inputs, initial_state, @@ -262,154 +267,154 @@ def attention_decoder(decoder_inputs, dtype=None, scope=None, initial_state_attention=False): - """RNN decoder with attention for the sequence-to-sequence model. - - In this context "attention" means that, during decoding, the RNN can look up - information in the additional tensor attention_states, and it does this by - focusing on a few entries from the tensor. This model has proven to yield - especially good results in a number of sequence-to-sequence tasks. This - implementation is based on http://arxiv.org/abs/1412.7449 (see below for - details). It is recommended for complex sequence-to-sequence tasks. - - Args: - decoder_inputs: A list of 2D Tensors [batch_size x input_size]. - initial_state: 2D Tensor [batch_size x cell.state_size]. - attention_states: 3D Tensor [batch_size x attn_length x attn_size]. - cell: core_rnn_cell.RNNCell defining the cell function and size. - output_size: Size of the output vectors; if None, we use cell.output_size. - num_heads: Number of attention heads that read from attention_states. - loop_function: If not None, this function will be applied to i-th output - in order to generate i+1-th input, and decoder_inputs will be ignored, - except for the first element ("GO" symbol). This can be used for decoding, - but also for training to emulate http://arxiv.org/abs/1506.03099. - Signature -- loop_function(prev, i) = next - * prev is a 2D Tensor of shape [batch_size x output_size], - * i is an integer, the step number (when advanced control is needed), - * next is a 2D Tensor of shape [batch_size x input_size]. - dtype: The dtype to use for the RNN initial state (default: tf.float32). - scope: VariableScope for the created subgraph; default: "attention_decoder". - initial_state_attention: If False (default), initial attentions are zero. - If True, initialize the attentions from the initial state and attention - states -- useful when we wish to resume decoding from a previously - stored decoder state and attention states. - - Returns: - A tuple of the form (outputs, state), where: - outputs: A list of the same length as decoder_inputs of 2D Tensors of - shape [batch_size x output_size]. These represent the generated outputs. 
- Output i is computed from input i (which is either the i-th element - of decoder_inputs or loop_function(output {i-1}, i)) as follows. - First, we run the cell on a combination of the input and previous - attention masks: - cell_output, new_state = cell(linear(input, prev_attn), prev_state). - Then, we calculate new attention masks: - new_attn = softmax(V^T * tanh(W * attention_states + U * new_state)) - and then we calculate the output: - output = linear(cell_output, new_attn). - state: The state of each decoder cell the final time-step. - It is a 2D Tensor of shape [batch_size x cell.state_size]. - - Raises: - ValueError: when num_heads is not positive, there are no inputs, shapes - of attention_states are not set, or input size cannot be inferred - from the input. - """ - if not decoder_inputs: - raise ValueError("Must provide at least 1 input to attention decoder.") - if num_heads < 1: - raise ValueError("With less than 1 heads, use a non-attention decoder.") - if attention_states.get_shape()[2].value is None: - raise ValueError("Shape[2] of attention_states must be known: %s" % - attention_states.get_shape()) - if output_size is None: - output_size = cell.output_size - - with variable_scope.variable_scope( - scope or "attention_decoder", dtype=dtype) as scope: - dtype = scope.dtype - - batch_size = array_ops.shape(decoder_inputs[0])[0] # Needed for reshaping. - attn_length = attention_states.get_shape()[1].value - if attn_length is None: - attn_length = array_ops.shape(attention_states)[1] - attn_size = attention_states.get_shape()[2].value - - # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before. - hidden = array_ops.reshape(attention_states, - [-1, attn_length, 1, attn_size]) - hidden_features = [] - v = [] - attention_vec_size = attn_size # Size of query vectors for attention. - for a in xrange(num_heads): - k = variable_scope.get_variable("AttnW_%d" % a, - [1, 1, attn_size, attention_vec_size]) - hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")) - v.append( - variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size])) - - state = initial_state - - def attention(query): - """Put attention masks on hidden using hidden_features and query.""" - ds = [] # Results of attention reads will be stored here. - if nest.is_sequence(query): # If the query is a tuple, flatten it. - query_list = nest.flatten(query) - for q in query_list: # Check that ndims == 2 if specified. - ndims = q.get_shape().ndims - if ndims: - assert ndims == 2 - query = array_ops.concat(query_list, 1) - for a in xrange(num_heads): - with variable_scope.variable_scope("Attention_%d" % a): - y = linear(query, attention_vec_size, True) - y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) - # Attention mask is a softmax of v^T * tanh(...). - s = math_ops.reduce_sum(v[a] * math_ops.tanh(hidden_features[a] + y), - [2, 3]) - a = nn_ops.softmax(s) - # Now calculate the attention-weighted vector d. - d = math_ops.reduce_sum( - array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2]) - ds.append(array_ops.reshape(d, [-1, attn_size])) - return ds - - outputs = [] - prev = None - batch_attn_size = array_ops.stack([batch_size, attn_size]) - attns = [ - array_ops.zeros( - batch_attn_size, dtype=dtype) for _ in xrange(num_heads) - ] - for a in attns: # Ensure the second shape of attention vectors is set. 
- a.set_shape([None, attn_size]) - if initial_state_attention: - attns = attention(initial_state) - for i, inp in enumerate(decoder_inputs): - if i > 0: - variable_scope.get_variable_scope().reuse_variables() - # If loop_function is set, we use it instead of decoder_inputs. - if loop_function is not None and prev is not None: - with variable_scope.variable_scope("loop_function", reuse=True): - inp = loop_function(prev, i) - # Merge input and previous attentions into one vector of the right size. - input_size = inp.get_shape().with_rank(2)[1] - if input_size.value is None: - raise ValueError("Could not infer input size from input: %s" % inp.name) - x = linear([inp] + attns, input_size, True) - # Run the RNN. - cell_output, state = cell(x, state) - # Run the attention mechanism. - if i == 0 and initial_state_attention: - with variable_scope.variable_scope( - variable_scope.get_variable_scope(), reuse=True): - attns = attention(state) - else: - attns = attention(state) - - with variable_scope.variable_scope("AttnOutputProjection"): - output = linear([cell_output] + attns, output_size, True) - if loop_function is not None: - prev = output - outputs.append(output) - - return outputs, state \ No newline at end of file + """RNN decoder with attention for the sequence-to-sequence model. + + In this context "attention" means that, during decoding, the RNN can look up + information in the additional tensor attention_states, and it does this by + focusing on a few entries from the tensor. This model has proven to yield + especially good results in a number of sequence-to-sequence tasks. This + implementation is based on http://arxiv.org/abs/1412.7449 (see below for + details). It is recommended for complex sequence-to-sequence tasks. + + Args: + decoder_inputs: A list of 2D Tensors [batch_size x input_size]. + initial_state: 2D Tensor [batch_size x cell.state_size]. + attention_states: 3D Tensor [batch_size x attn_length x attn_size]. + cell: core_rnn_cell.RNNCell defining the cell function and size. + output_size: Size of the output vectors; if None, we use cell.output_size. + num_heads: Number of attention heads that read from attention_states. + loop_function: If not None, this function will be applied to i-th output + in order to generate i+1-th input, and decoder_inputs will be ignored, + except for the first element ("GO" symbol). This can be used for decoding, + but also for training to emulate http://arxiv.org/abs/1506.03099. + Signature -- loop_function(prev, i) = next + * prev is a 2D Tensor of shape [batch_size x output_size], + * i is an integer, the step number (when advanced control is needed), + * next is a 2D Tensor of shape [batch_size x input_size]. + dtype: The dtype to use for the RNN initial state (default: tf.float32). + scope: VariableScope for the created subgraph; default: "attention_decoder". + initial_state_attention: If False (default), initial attentions are zero. + If True, initialize the attentions from the initial state and attention + states -- useful when we wish to resume decoding from a previously + stored decoder state and attention states. + + Returns: + A tuple of the form (outputs, state), where: + outputs: A list of the same length as decoder_inputs of 2D Tensors of + shape [batch_size x output_size]. These represent the generated outputs. + Output i is computed from input i (which is either the i-th element + of decoder_inputs or loop_function(output {i-1}, i)) as follows. 
+ First, we run the cell on a combination of the input and previous + attention masks: + cell_output, new_state = cell(linear(input, prev_attn), prev_state). + Then, we calculate new attention masks: + new_attn = softmax(V^T * tanh(W * attention_states + U * new_state)) + and then we calculate the output: + output = linear(cell_output, new_attn). + state: The state of each decoder cell the final time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. + + Raises: + ValueError: when num_heads is not positive, there are no inputs, shapes + of attention_states are not set, or input size cannot be inferred + from the input. + """ + if not decoder_inputs: + raise ValueError("Must provide at least 1 input to attention decoder.") + if num_heads < 1: + raise ValueError("With less than 1 heads, use a non-attention decoder.") + if attention_states.get_shape()[2].value is None: + raise ValueError("Shape[2] of attention_states must be known: %s" % + attention_states.get_shape()) + if output_size is None: + output_size = cell.output_size + + with variable_scope.variable_scope( + scope or "attention_decoder", dtype=dtype) as scope: + dtype = scope.dtype + + batch_size = array_ops.shape(decoder_inputs[0])[0] # Needed for reshaping. + attn_length = attention_states.get_shape()[1].value + if attn_length is None: + attn_length = array_ops.shape(attention_states)[1] + attn_size = attention_states.get_shape()[2].value + + # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before. + hidden = array_ops.reshape(attention_states, + [-1, attn_length, 1, attn_size]) + hidden_features = [] + v = [] + attention_vec_size = attn_size # Size of query vectors for attention. + for a in xrange(num_heads): + k = variable_scope.get_variable("AttnW_%d" % a, + [1, 1, attn_size, attention_vec_size]) + hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")) + v.append( + variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size])) + + state = initial_state + + def attention(query): + """Put attention masks on hidden using hidden_features and query.""" + ds = [] # Results of attention reads will be stored here. + if nest.is_sequence(query): # If the query is a tuple, flatten it. + query_list = nest.flatten(query) + for q in query_list: # Check that ndims == 2 if specified. + ndims = q.get_shape().ndims + if ndims: + assert ndims == 2 + query = array_ops.concat(query_list, 1) + for a in xrange(num_heads): + with variable_scope.variable_scope("Attention_%d" % a): + y = linear(query, attention_vec_size, True) + y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) + # Attention mask is a softmax of v^T * tanh(...). + s = math_ops.reduce_sum(v[a] * math_ops.tanh(hidden_features[a] + y), + [2, 3]) + a = nn_ops.softmax(s) + # Now calculate the attention-weighted vector d. + d = math_ops.reduce_sum( + array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2]) + ds.append(array_ops.reshape(d, [-1, attn_size])) + return ds + + outputs = [] + prev = None + batch_attn_size = array_ops.stack([batch_size, attn_size]) + attns = [ + array_ops.zeros( + batch_attn_size, dtype=dtype) for _ in xrange(num_heads) + ] + for a in attns: # Ensure the second shape of attention vectors is set. + a.set_shape([None, attn_size]) + if initial_state_attention: + attns = attention(initial_state) + for i, inp in enumerate(decoder_inputs): + if i > 0: + variable_scope.get_variable_scope().reuse_variables() + # If loop_function is set, we use it instead of decoder_inputs. 
+ if loop_function is not None and prev is not None: + with variable_scope.variable_scope("loop_function", reuse=True): + inp = loop_function(prev, i) + # Merge input and previous attentions into one vector of the right size. + input_size = inp.get_shape().with_rank(2)[1] + if input_size.value is None: + raise ValueError("Could not infer input size from input: %s" % inp.name) + x = linear([inp] + attns, input_size, True) + # Run the RNN. + cell_output, state = cell(x, state) + # Run the attention mechanism. + if i == 0 and initial_state_attention: + with variable_scope.variable_scope( + variable_scope.get_variable_scope(), reuse=True): + attns = attention(state) + else: + attns = attention(state) + + with variable_scope.variable_scope("AttnOutputProjection"): + output = linear([cell_output] + attns, output_size, True) + if loop_function is not None: + prev = output + outputs.append(output) + + return outputs, state diff --git a/tf_chatbot/lib/data_utils.py b/tf_chatbot/lib/data_utils.py index 0c11bbd..cb976ee 100644 --- a/tf_chatbot/lib/data_utils.py +++ b/tf_chatbot/lib/data_utils.py @@ -70,7 +70,7 @@ def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size, if not gfile.Exists(vocabulary_path): print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path)) vocab = {} - data = json.load(open(data_path, encoding=_ENCODING)) + data = json.load(open(data_path), encoding=_ENCODING) counter = 0 for ((q,qe),(a,ae)) in data: counter += 1 @@ -122,8 +122,8 @@ def sentence_to_token_ids(sentence, vocabulary, else: words = basic_tokenizer(sentence) if not normalize_digits: - return [vocabulary.get(w, UNK_ID) for w in words] - return [vocabulary.get(re.sub(_DIGIT_RE, "0", w), UNK_ID) for w in words] + return [vocabulary.get(w.encode('utf8'), UNK_ID) for w in words] + return [vocabulary.get(re.sub(_DIGIT_RE, "0", w.encode('utf8')), UNK_ID) for w in words] def data_to_token_ids_bak(data_path, target_path, vocabulary_path, tokenizer=None, normalize_digits=True): @@ -147,7 +147,7 @@ def data_to_token_ids(data_path, target_path, vocabulary_path, print("Tokenizing data in %s" % data_path) vocab, _ = initialize_vocabulary(vocabulary_path) with gfile.GFile(target_path, mode='w') as tokens_file: - data = json.load(open(data_path, encoding=_ENCODING)) + data = json.load(open(data_path), encoding=_ENCODING) counter = 0 for ((q,qe),(a,ae)) in data: counter += 1 diff --git a/tf_chatbot/lib/predict.py b/tf_chatbot/lib/predict.py index 01ecf44..00b23f7 100644 --- a/tf_chatbot/lib/predict.py +++ b/tf_chatbot/lib/predict.py @@ -9,7 +9,7 @@ def predict(): def _get_test_dataset(): - data = json.load(open(TEST_DATASET_PATH, encoding=data_utils._ENCODING)) + data = json.load(open(TEST_DATASET_PATH)) test_sentences = [q for ((q, qe), _) in data] return test_sentences @@ -27,7 +27,8 @@ def _get_test_dataset(): test_dataset = _get_test_dataset() for sentence in test_dataset: - predicted_sentence = get_predicted_sentence(sentence, vocab, rev_vocab, model, sess) - print(sentence, '->', predicted_sentence) + predicted_sentence = get_predicted_sentence(sentence, vocab, rev_vocab, model, sess, use_beam_search=True) + print(sentence, '->') + print(predicted_sentence) results_fh.write(predicted_sentence + '\n') \ No newline at end of file diff --git a/tf_chatbot/lib/seq2seq_model.py b/tf_chatbot/lib/seq2seq_model.py index f0fa16b..61ecde3 100644 --- a/tf_chatbot/lib/seq2seq_model.py +++ b/tf_chatbot/lib/seq2seq_model.py @@ -9,9 +9,10 @@ import tensorflow as tf import tf_chatbot.lib.data_utils as data_utils 
-from tensorflow.contrib.legacy_seq2seq import model_with_buckets, embedding_attention_seq2seq +from tensorflow.contrib.legacy_seq2seq import model_with_buckets, sequence_loss, embedding_attention_decoder from tf_chatbot.lib.basic.advanced_seq2seq import embedding_attention_sampled_seq2seq -from tensorflow.contrib.rnn import GRUCell, BasicLSTMCell, MultiRNNCell +from tensorflow.contrib.rnn import GRUCell, BasicLSTMCell, MultiRNNCell, EmbeddingWrapper, static_rnn, OutputProjectionWrapper + class Seq2SeqModel(object): """Sequence-to-sequence model with attention and for multiple buckets. @@ -40,6 +41,7 @@ def __init__(self, use_lstm=False, num_samples=512, forward_only=False, + beam_search_size=1, dtype=tf.float32): """Create the model. Args: @@ -72,6 +74,7 @@ def __init__(self, self.learning_rate_decay_op = self.learning_rate.assign( self.learning_rate * learning_rate_decay_factor) self.global_step = tf.Variable(0, trainable=False) + self.beam_search_size = beam_search_size # If we use sampled softmax, we need an output projection. output_projection = None @@ -113,8 +116,65 @@ def single_cell(): if num_layers > 1: cell = MultiRNNCell([single_cell() for _ in range(num_layers)]) + self.model_encoder_states = {} + self.model_attention_states = {} + self.topk_probs = [] + self.topk_ids = [] + # The seq2seq function: we use embedding for the input and attention. - def seq2seq_f(encoder_inputs, decoder_inputs, do_decode): + def seq2seq_f(encoder_inputs, decoder_inputs, do_decode, bucket_id): + + def embedding_attention_sampled_seq2seq(encoder_inputs, + decoder_inputs, + cell, + num_encoder_symbols, + num_decoder_symbols, + embedding_size, + bucket_index, + num_heads=1, + output_projection=None, + feed_previous=False, + initial_state_attention=False, + dtype=tf.float32): + with tf.variable_scope("embedding_attention_sampled_seq2seq"): + encoder_cell = EmbeddingWrapper( + cell, + embedding_classes=num_encoder_symbols, + embedding_size=embedding_size + ) + encoder_outputs, encoder_state = static_rnn( + encoder_cell, encoder_inputs, dtype=dtype) + + top_states = [ + tf.reshape(e, [-1,1, cell.output_size]) for e in encoder_outputs + ] + attention_states = tf.concat(top_states, 1) + + self.model_encoder_states[bucket_index] = encoder_state + self.model_attention_states[bucket_index] = attention_states + + output_size = None + if output_projection is None: + cell = OutputProjectionWrapper(cell, num_decoder_symbols) + output_size = num_decoder_symbols + + if isinstance(feed_previous, bool): + return embedding_attention_decoder( + decoder_inputs, + self.model_encoder_states[bucket_index], #encoder_state, + self.model_attention_states[bucket_index], #attention_states, + cell, + num_decoder_symbols, + embedding_size, + num_heads=num_heads, + output_size=output_size, + output_projection=output_projection, + feed_previous=feed_previous, + initial_state_attention=initial_state_attention) + + else: + raise NotImplementedError() + return embedding_attention_sampled_seq2seq( encoder_inputs, decoder_inputs, @@ -122,9 +182,9 @@ def seq2seq_f(encoder_inputs, decoder_inputs, do_decode): num_encoder_symbols=source_vocab_size, num_decoder_symbols=target_vocab_size, embedding_size=size, + bucket_index=bucket_id, output_projection=output_projection, - feed_previous=do_decode, - dtype=dtype) + feed_previous=do_decode) # Feeds for inputs. 
self.encoder_inputs = [] @@ -143,25 +203,43 @@ def seq2seq_f(encoder_inputs, decoder_inputs, do_decode): targets = [self.decoder_inputs[i + 1] for i in range(len(self.decoder_inputs) - 1)] - # Training outputs and losses. - if forward_only: - self.outputs, self.losses = model_with_buckets( - self.encoder_inputs, self.decoder_inputs, targets, - self.target_weights, buckets, lambda x, y: seq2seq_f(x, y, True), - softmax_loss_function=softmax_loss_function) - # If we use output projection, we need to project outputs for decoding. - if output_projection is not None: - for b in range(len(buckets)): - self.outputs[b] = [ - tf.matmul(output, output_projection[0]) + output_projection[1] - for output in self.outputs[b] - ] - else: - self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets( - self.encoder_inputs, self.decoder_inputs, targets, - self.target_weights, buckets, - lambda x, y: seq2seq_f(x, y, False), - softmax_loss_function=softmax_loss_function) + with tf.variable_scope("model_with_buckets"): + self.losses = [] + self.outputs = [] + self.decoder_out_state = [] + for bucket_idx, bucket in enumerate(buckets): + with tf.variable_scope(tf.get_variable_scope(), reuse=True if bucket_idx > 0 else None): + if forward_only: + bucket_outputs, bucket_outputs_state = seq2seq_f(self.encoder_inputs[:bucket[0]], + self.decoder_inputs[:bucket[1]], + True, + bucket_idx) + else: + bucket_outputs, bucket_outputs_state = seq2seq_f(self.encoder_inputs[:bucket[0]], + self.decoder_inputs[:bucket[1]], + False, + bucket_idx) + self.outputs.append(bucket_outputs) + self.decoder_out_state.append(bucket_outputs_state) + self.losses.append( + sequence_loss( + self.outputs[-1], + targets[:bucket[1]], + self.target_weights[:bucket[1]], + softmax_loss_function=softmax_loss_function + ) + ) + + if forward_only and output_projection is not None: + for b in range(len(buckets)): + self.outputs[b] = [ + tf.matmul(output, output_projection[0]) + output_projection[1] + for output in self.outputs[b]] + #best_outputs = [tf.argmax(x,1) for x in self.outputs[b]] + #best_outputs = tf.concat(axis=1, values=[tf.reshape(x, [self.batch_size, 1]) for x in best_outputs]) + _topk_log_probs, _topk_ids = tf.nn.top_k(tf.nn.softmax(self.outputs[b][-1]), beam_search_size) + self.topk_probs.append(_topk_log_probs) + self.topk_ids.append(_topk_ids) # Gradients and SGD update operation for training the model. params = tf.trainable_variables() @@ -179,24 +257,84 @@ def seq2seq_f(encoder_inputs, decoder_inputs, do_decode): self.saver = tf.train.Saver(tf.global_variables()) + def step_beam_search(self, session, encoder_inputs, decoder_inputs, target_weights, + bucket_id, forward_only, use_beam_search=True): + encoder_size, decoder_size = self.buckets[bucket_id] + if len(encoder_inputs) != encoder_size: + raise ValueError("Encoder length must be equal to the one in bucket," + " %d != %d." % (len(encoder_inputs), encoder_size)) + if len(decoder_inputs) != decoder_size: + raise ValueError("Decoder length must be equal to the one in bucket," + " %d != %d." % (len(decoder_inputs), decoder_size)) + if len(target_weights) != decoder_size: + raise ValueError("Weights length must be equal to the one in bucket," + " %d != %d." 
% (len(target_weights), decoder_size)) + + input_feed = {} + for l in range(encoder_size): + input_feed[self.encoder_inputs[l].name] = encoder_inputs[l] + for l in range(decoder_size): + input_feed[self.decoder_inputs[l].name] = decoder_inputs[l] + input_feed[self.target_weights[l].name] = target_weights[l] + + last_target = self.decoder_inputs[decoder_size].name + input_feed[last_target] = np.zeros([self.batch_size], dtype=np.int32) + + if not forward_only: + raise NotImplementedError("Not Implemented!!!") + else: + if use_beam_search: + output_feed = [self.model_attention_states[bucket_id], + self.model_encoder_states[bucket_id]] + outputs = session.run(output_feed, input_feed) # attention_state, encoder_state + beams = [(0.0, [data_utils.GO_ID], data_utils.GO_ID, outputs[1])] * 3 # score, result, last_token, encoder_state + result = [] + step = 0 + attention_state = outputs[0] + + while step < decoder_size and len(result) < self.beam_search_size: + step += 1 + _last_tokens = [beam_[2] for beam_ in beams] + _encoder_state = [beam_[3] for beam_ in beams] + output_feed = [self.topk_ids[bucket_id], self.topk_probs[bucket_id], self.decoder_out_state[bucket_id]] + input_feed = {} + input_feed[self.model_attention_states[bucket_id].name] = attention_state + input_feed[self.model_encoder_states[bucket_id].name] = np.squeeze(np.array(_encoder_state)) + for l in range(step): + _decoder_inputs = [beam_[1][l] for beam_ in beams] + input_feed[self.decoder_inputs[l].name] = _decoder_inputs + + _tok_ids, _tok_probs, _out_states = session.run(output_feed, input_feed) + + new_beams = [] + + for beam_idx in range(self.beam_search_size): + for _idx in range(self.beam_search_size): + new_beams.append((beams[beam_idx][0]+_tok_probs[beam_idx][_idx], beams[beam_idx][1]+[_tok_ids[beam_idx][_idx]], _tok_ids[beam_idx][_idx], _out_states[beam_idx])) + + new_beams.sort(key=lambda x: x[0], reverse=True) + beams = [] + for beam_ in new_beams: + if beam_[2] == data_utils.EOS_ID: + result.append((beam_[0],beam_[1][:-1],beam_[2],beam_[3])) + else: + beams.append(beam_) + if len(beams) == self.beam_search_size: + break + + if step == decoder_size: + for beam_ in beams: + result.append(beam_) + if len(result) == self.beam_search_size: + break + return None, None, result[0][1] + + else: + raise NotImplementedError("Not Implemented!!!") + + def step(self, session, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only): - """Run a step of the model feeding the given inputs. - Args: - session: tensorflow session to use. - encoder_inputs: list of numpy int vectors to feed as encoder inputs. - decoder_inputs: list of numpy int vectors to feed as decoder inputs. - target_weights: list of numpy float vectors to feed as target weights. - bucket_id: which bucket of the model to use. - forward_only: whether to do the backward step or only forward. - Returns: - A triple consisting of gradient norm (or None if we did not do backward), - average perplexity, and the outputs. - Raises: - ValueError: if length of encoder_inputs, decoder_inputs, or - target_weights disagrees with bucket size for the specified bucket_id. - """ - # Check if the sizes match. encoder_size, decoder_size = self.buckets[bucket_id] if len(encoder_inputs) != encoder_size: raise ValueError("Encoder length must be equal to the one in bucket," @@ -208,7 +346,6 @@ def step(self, session, encoder_inputs, decoder_inputs, target_weights, raise ValueError("Weights length must be equal to the one in bucket," " %d != %d." 
% (len(target_weights), decoder_size)) - # Input feed: encoder inputs, decoder inputs, target_weights, as provided. input_feed = {} for l in range(encoder_size): input_feed[self.encoder_inputs[l].name] = encoder_inputs[l] @@ -216,11 +353,9 @@ def step(self, session, encoder_inputs, decoder_inputs, target_weights, input_feed[self.decoder_inputs[l].name] = decoder_inputs[l] input_feed[self.target_weights[l].name] = target_weights[l] - # Since our targets are decoder inputs shifted by one, we need one more. last_target = self.decoder_inputs[decoder_size].name input_feed[last_target] = np.zeros([self.batch_size], dtype=np.int32) - # Output feed: depends on whether we do a backward step or not. if not forward_only: output_feed = [self.updates[bucket_id], # Update Op that does SGD. self.gradient_norms[bucket_id], # Gradient norm. @@ -231,61 +366,41 @@ def step(self, session, encoder_inputs, decoder_inputs, target_weights, output_feed.append(self.outputs[bucket_id][l]) outputs = session.run(output_feed, input_feed) + if not forward_only: return outputs[1], outputs[2], None # Gradient norm, loss, no outputs. else: return None, outputs[0], outputs[1:] # No gradient norm, loss, outputs. def get_batch(self, data, bucket_id): - """Get a random batch of data from the specified bucket, prepare for step. - To feed data in step(..) it must be a list of batch-major vectors, while - data here contains single length-major cases. So the main logic of this - function is to re-index data cases to be in the proper format for feeding. - Args: - data: a tuple of size len(self.buckets) in which each element contains - lists of pairs of input and output data that we use to create a batch. - bucket_id: integer, which bucket to get the batch for. - Returns: - The triple (encoder_inputs, decoder_inputs, target_weights) for - the constructed batch that has the proper format to call step(...) later. - """ + encoder_size, decoder_size = self.buckets[bucket_id] encoder_inputs, decoder_inputs = [], [] - # Get a random batch of encoder and decoder inputs from data, - # pad them if needed, reverse encoder inputs and add GO to decoder. for _ in range(self.batch_size): encoder_input, decoder_input = random.choice(data[bucket_id]) - # Encoder inputs are padded and then reversed. encoder_pad = [data_utils.PAD_ID] * (encoder_size - len(encoder_input)) encoder_inputs.append(list(reversed(encoder_input + encoder_pad))) - # Decoder inputs get an extra "GO" symbol, and are padded then. decoder_pad_size = decoder_size - len(decoder_input) - 1 decoder_inputs.append([data_utils.GO_ID] + decoder_input + [data_utils.PAD_ID] * decoder_pad_size) - # Now we create batch-major vectors from the data selected above. batch_encoder_inputs, batch_decoder_inputs, batch_weights = [], [], [] - # Batch encoder inputs are just re-indexed encoder_inputs. for length_idx in range(encoder_size): batch_encoder_inputs.append( np.array([encoder_inputs[batch_idx][length_idx] for batch_idx in range(self.batch_size)], dtype=np.int32)) - # Batch decoder inputs are re-indexed decoder_inputs, we create weights. for length_idx in range(decoder_size): batch_decoder_inputs.append( np.array([decoder_inputs[batch_idx][length_idx] for batch_idx in range(self.batch_size)], dtype=np.int32)) - # Create target_weights to be 0 for targets that are padding. batch_weight = np.ones(self.batch_size, dtype=np.float32) for batch_idx in range(self.batch_size): - # We set weight to 0 if the corresponding target is a PAD symbol. 
- # The corresponding target is decoder_input shifted by 1 forward. if length_idx < decoder_size - 1: target = decoder_inputs[batch_idx][length_idx + 1] if length_idx == decoder_size - 1 or target == data_utils.PAD_ID: diff --git a/tf_chatbot/lib/seq2seq_model_utils.py b/tf_chatbot/lib/seq2seq_model_utils.py index 0965ce9..79e8659 100644 --- a/tf_chatbot/lib/seq2seq_model_utils.py +++ b/tf_chatbot/lib/seq2seq_model_utils.py @@ -24,6 +24,7 @@ def create_model(session, forward_only): learning_rate=FLAGS.learning_rate, learning_rate_decay_factor=FLAGS.learning_rate_decay_factor, use_lstm=False, + beam_search_size=FLAGS.beam_search_size, forward_only=forward_only) ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir) @@ -37,7 +38,7 @@ def create_model(session, forward_only): session.run(tf.global_variables_initializer()) return model -def get_predicted_sentence(input_sentence, vocab, rev_vocab, model, sess): +def get_predicted_sentence(input_sentence, vocab, rev_vocab, model, sess, use_beam_search=False): input_token_ids = data_utils.sentence_to_token_ids(input_sentence, vocab) bucket_id = min([b for b in range(len(BUCKETS)) if BUCKETS[b][0] > len(input_token_ids)]) @@ -46,17 +47,20 @@ def get_predicted_sentence(input_sentence, vocab, rev_vocab, model, sess): feed_data = {bucket_id: [(input_token_ids, outputs)]} encoder_inputs, decoder_inputs, target_weights = model.get_batch(feed_data, bucket_id) - _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only=True) + if use_beam_search: + _, _, output_words = model.step_beam_search(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only=True) + outputs = output_words[1:] + output_sentence = ' '.join([rev_vocab[token_id] for token_id in outputs]) - outputs = [] - - for logit in output_logits: - selected_token_id = int(np.argmax(logit, axis=1)) - if selected_token_id == data_utils.EOS_ID: - break - else: - outputs.append(selected_token_id) - - output_sentence = ' '.join([rev_vocab[output] for output in outputs]) + else: + _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only=True) + outputs = [] + for logit in output_logits: + selected_token_id = int(np.argmax(logit, axis=1)) + if selected_token_id == data_utils.EOS_ID: + break + else: + outputs.append(selected_token_id) + output_sentence = ' '.join([rev_vocab[output] for output in outputs]) return output_sentence \ No newline at end of file From 7402ce99794c22c4514e8738a654667e0625aced Mon Sep 17 00:00:00 2001 From: RuiCheung Date: Fri, 12 May 2017 15:59:51 +0800 Subject: [PATCH 02/14] Convert to PEP8 format. --- tf_chatbot/lib/basic/advanced_seq2seq.py | 420 ----------------------- tf_chatbot/lib/chat.py | 11 +- tf_chatbot/lib/data_utils.py | 70 ++-- tf_chatbot/lib/predict.py | 14 +- tf_chatbot/lib/seq2seq_model.py | 314 +++++++++-------- tf_chatbot/lib/seq2seq_model_utils.py | 31 +- tf_chatbot/lib/train.py | 49 ++- 7 files changed, 303 insertions(+), 606 deletions(-) delete mode 100644 tf_chatbot/lib/basic/advanced_seq2seq.py diff --git a/tf_chatbot/lib/basic/advanced_seq2seq.py b/tf_chatbot/lib/basic/advanced_seq2seq.py deleted file mode 100644 index 9812072..0000000 --- a/tf_chatbot/lib/basic/advanced_seq2seq.py +++ /dev/null @@ -1,420 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# We disable pylint because we need python3 compatibility. 
-# from six.moves import xrange # pylint: disable=redefined-builtin -# from six.moves import zip # pylint: disable=redefined-builtin - -from tensorflow.contrib.rnn.python.ops import core_rnn -from tensorflow.contrib.rnn.python.ops import core_rnn_cell -from tensorflow.contrib.rnn.python.ops import core_rnn_cell_impl -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import control_flow_ops -from tensorflow.python.ops import embedding_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import nn_ops -from tensorflow.python.ops import variable_scope -from tensorflow.python.util import nest -from tensorflow import multinomial, squeeze -import tensorflow as tf - -# TODO(ebrevdo): Remove once _linear is fully deprecated. -linear = core_rnn_cell_impl._linear # pylint: disable=protected-access - - -def _extract_sample_and_embed(embedding, - output_projection=None, - update_embedding=True): - def loop_function(prev, _): - if output_projection is not None: - prev = nn_ops.xw_plus_b(prev, output_projection[0], output_projection[1]) - # prev_symbol = math_ops.argmax(prev, 1) - prev_symbol = squeeze(multinomial(prev, 1), axis=1) - emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol) - if not update_embedding: - emb_prev = array_ops.stop_gradient(emb_prev) - return emb_prev - - return loop_function - - -def embedding_attention_sampled_seq2seq(encoder_inputs, - decoder_inputs, - cell, - num_encoder_symbols, - num_decoder_symbols, - embedding_size, - num_heads=1, - output_projection=None, - feed_previous=False, - dtype=None, - scope=None, - initial_state_attention=False, - batch_size=None): - """Embedding sequence-to-sequence model with attention. - - This model first embeds encoder_inputs by a newly created embedding (of shape - [num_encoder_symbols x input_size]). Then it runs an RNN to encode - embedded encoder_inputs into a state vector. It keeps the outputs of this - RNN at every step to use for attention later. Next, it embeds decoder_inputs - by another newly created embedding (of shape [num_decoder_symbols x - input_size]). Then it runs attention decoder, initialized with the last - encoder state, on embedded decoder_inputs and attending to encoder outputs. - - Warning: when output_projection is None, the size of the attention vectors - and variables will be made proportional to num_decoder_symbols, can be large. - - Args: - encoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. - decoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. - cell: core_rnn_cell.RNNCell defining the cell function and size. - num_encoder_symbols: Integer; number of symbols on the encoder side. - num_decoder_symbols: Integer; number of symbols on the decoder side. - embedding_size: Integer, the length of the embedding vector for each symbol. - num_heads: Number of attention heads that read from attention_states. - output_projection: None or a pair (W, B) of output projection weights and - biases; W has shape [output_size x num_decoder_symbols] and B has - shape [num_decoder_symbols]; if provided and feed_previous=True, each - fed previous output will first be multiplied by W and added B. - feed_previous: Boolean or scalar Boolean Tensor; if True, only the first - of decoder_inputs will be used (the "GO" symbol), and all other decoder - inputs will be taken from previous outputs (as in embedding_rnn_decoder). 
- If False, decoder_inputs are used as given (the standard decoder case). - dtype: The dtype of the initial RNN state (default: tf.float32). - scope: VariableScope for the created subgraph; defaults to - "embedding_attention_seq2seq". - initial_state_attention: If False (default), initial attentions are zero. - If True, initialize the attentions from the initial state and attention - states. - - Returns: - A tuple of the form (outputs, state), where: - outputs: A list of the same length as decoder_inputs of 2D Tensors with - shape [batch_size x num_decoder_symbols] containing the generated - outputs. - state: The state of each decoder cell at the final time-step. - It is a 2D Tensor of shape [batch_size x cell.state_size]. - """ - with variable_scope.variable_scope( - scope or "embedding_attention_seq2seq", dtype=dtype) as scope: - dtype = scope.dtype - # Encoder. - encoder_cell = core_rnn_cell.EmbeddingWrapper( - cell, - embedding_classes=num_encoder_symbols, - embedding_size=embedding_size) - encoder_outputs, encoder_state = core_rnn.static_rnn( - encoder_cell, encoder_inputs, dtype=dtype) - - # First calculate a concatenation of encoder outputs to put attention on. - top_states = [ - array_ops.reshape(e, [-1, 1, cell.output_size]) for e in encoder_outputs - ] - attention_states = array_ops.concat(top_states, 1) - - # Decoder. - output_size = None - if output_projection is None: - cell = core_rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols) - output_size = num_decoder_symbols - - if isinstance(feed_previous, bool): - return embedding_attention_decoder( - decoder_inputs, - encoder_state, - attention_states, - cell, - num_decoder_symbols, - embedding_size, - num_heads=num_heads, - output_size=output_size, - output_projection=output_projection, - feed_previous=feed_previous, - initial_state_attention=initial_state_attention) - - else: - # If feed_previous is a Tensor, we construct 2 graphs and use cond. - def decoder(feed_previous_bool): - reuse = None if feed_previous_bool else True - with variable_scope.variable_scope( - variable_scope.get_variable_scope(), reuse=reuse) as scope: - outputs, state = embedding_attention_decoder( - decoder_inputs, - encoder_state, - attention_states, - cell, - num_decoder_symbols, - embedding_size, - num_heads=num_heads, - output_size=output_size, - output_projection=output_projection, - feed_previous=feed_previous_bool, - update_embedding_for_previous=False, - initial_state_attention=initial_state_attention) - state_list = [state] - if nest.is_sequence(state): - state_list = nest.flatten(state) - return outputs + state_list - - outputs_and_state = control_flow_ops.cond(feed_previous, - lambda: decoder(True), - lambda: decoder(False)) - outputs_len = len(decoder_inputs) # Outputs length same as decoder inputs. - state_list = outputs_and_state[outputs_len:] - state = state_list[0] - if nest.is_sequence(encoder_state): - state = nest.pack_sequence_as( - structure=encoder_state, flat_sequence=state_list) - return outputs_and_state[:outputs_len], state - - -def embedding_attention_decoder(decoder_inputs, - initial_state, - attention_states, - cell, - num_symbols, - embedding_size, - num_heads=1, - output_size=None, - output_projection=None, - feed_previous=False, - update_embedding_for_previous=True, - dtype=None, - scope=None, - initial_state_attention=False): - """RNN decoder with embedding and attention and a pure-decoding option. - - Args: - decoder_inputs: A list of 1D batch-sized int32 Tensors (decoder inputs). 
- initial_state: 2D Tensor [batch_size x cell.state_size]. - attention_states: 3D Tensor [batch_size x attn_length x attn_size]. - cell: core_rnn_cell.RNNCell defining the cell function. - num_symbols: Integer, how many symbols come into the embedding. - embedding_size: Integer, the length of the embedding vector for each symbol. - num_heads: Number of attention heads that read from attention_states. - output_size: Size of the output vectors; if None, use output_size. - output_projection: None or a pair (W, B) of output projection weights and - biases; W has shape [output_size x num_symbols] and B has shape - [num_symbols]; if provided and feed_previous=True, each fed previous - output will first be multiplied by W and added B. - feed_previous: Boolean; if True, only the first of decoder_inputs will be - used (the "GO" symbol), and all other decoder inputs will be generated by: - next = embedding_lookup(embedding, argmax(previous_output)), - In effect, this implements a greedy decoder. It can also be used - during training to emulate http://arxiv.org/abs/1506.03099. - If False, decoder_inputs are used as given (the standard decoder case). - update_embedding_for_previous: Boolean; if False and feed_previous=True, - only the embedding for the first symbol of decoder_inputs (the "GO" - symbol) will be updated by back propagation. Embeddings for the symbols - generated from the decoder itself remain unchanged. This parameter has - no effect if feed_previous=False. - dtype: The dtype to use for the RNN initial states (default: tf.float32). - scope: VariableScope for the created subgraph; defaults to - "embedding_attention_decoder". - initial_state_attention: If False (default), initial attentions are zero. - If True, initialize the attentions from the initial state and attention - states -- useful when we wish to resume decoding from a previously - stored decoder state and attention states. - - Returns: - A tuple of the form (outputs, state), where: - outputs: A list of the same length as decoder_inputs of 2D Tensors with - shape [batch_size x output_size] containing the generated outputs. - state: The state of each decoder cell at the final time-step. - It is a 2D Tensor of shape [batch_size x cell.state_size]. - - Raises: - ValueError: When output_projection has the wrong shape. - """ - if output_size is None: - output_size = cell.output_size - if output_projection is not None: - proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype) - proj_biases.get_shape().assert_is_compatible_with([num_symbols]) - - with variable_scope.variable_scope( - scope or "embedding_attention_decoder", dtype=dtype) as scope: - - embedding = variable_scope.get_variable("embedding", - [num_symbols, embedding_size]) - loop_function = _extract_sample_and_embed( - embedding, output_projection, - update_embedding_for_previous) if feed_previous else None - emb_inp = [ - embedding_ops.embedding_lookup(embedding, i) for i in decoder_inputs - ] - return attention_decoder( - emb_inp, - initial_state, - attention_states, - cell, - output_size=output_size, - num_heads=num_heads, - loop_function=loop_function, - initial_state_attention=initial_state_attention) - - -def attention_decoder(decoder_inputs, - initial_state, - attention_states, - cell, - output_size=None, - num_heads=1, - loop_function=None, - dtype=None, - scope=None, - initial_state_attention=False): - """RNN decoder with attention for the sequence-to-sequence model. 
- - In this context "attention" means that, during decoding, the RNN can look up - information in the additional tensor attention_states, and it does this by - focusing on a few entries from the tensor. This model has proven to yield - especially good results in a number of sequence-to-sequence tasks. This - implementation is based on http://arxiv.org/abs/1412.7449 (see below for - details). It is recommended for complex sequence-to-sequence tasks. - - Args: - decoder_inputs: A list of 2D Tensors [batch_size x input_size]. - initial_state: 2D Tensor [batch_size x cell.state_size]. - attention_states: 3D Tensor [batch_size x attn_length x attn_size]. - cell: core_rnn_cell.RNNCell defining the cell function and size. - output_size: Size of the output vectors; if None, we use cell.output_size. - num_heads: Number of attention heads that read from attention_states. - loop_function: If not None, this function will be applied to i-th output - in order to generate i+1-th input, and decoder_inputs will be ignored, - except for the first element ("GO" symbol). This can be used for decoding, - but also for training to emulate http://arxiv.org/abs/1506.03099. - Signature -- loop_function(prev, i) = next - * prev is a 2D Tensor of shape [batch_size x output_size], - * i is an integer, the step number (when advanced control is needed), - * next is a 2D Tensor of shape [batch_size x input_size]. - dtype: The dtype to use for the RNN initial state (default: tf.float32). - scope: VariableScope for the created subgraph; default: "attention_decoder". - initial_state_attention: If False (default), initial attentions are zero. - If True, initialize the attentions from the initial state and attention - states -- useful when we wish to resume decoding from a previously - stored decoder state and attention states. - - Returns: - A tuple of the form (outputs, state), where: - outputs: A list of the same length as decoder_inputs of 2D Tensors of - shape [batch_size x output_size]. These represent the generated outputs. - Output i is computed from input i (which is either the i-th element - of decoder_inputs or loop_function(output {i-1}, i)) as follows. - First, we run the cell on a combination of the input and previous - attention masks: - cell_output, new_state = cell(linear(input, prev_attn), prev_state). - Then, we calculate new attention masks: - new_attn = softmax(V^T * tanh(W * attention_states + U * new_state)) - and then we calculate the output: - output = linear(cell_output, new_attn). - state: The state of each decoder cell the final time-step. - It is a 2D Tensor of shape [batch_size x cell.state_size]. - - Raises: - ValueError: when num_heads is not positive, there are no inputs, shapes - of attention_states are not set, or input size cannot be inferred - from the input. - """ - if not decoder_inputs: - raise ValueError("Must provide at least 1 input to attention decoder.") - if num_heads < 1: - raise ValueError("With less than 1 heads, use a non-attention decoder.") - if attention_states.get_shape()[2].value is None: - raise ValueError("Shape[2] of attention_states must be known: %s" % - attention_states.get_shape()) - if output_size is None: - output_size = cell.output_size - - with variable_scope.variable_scope( - scope or "attention_decoder", dtype=dtype) as scope: - dtype = scope.dtype - - batch_size = array_ops.shape(decoder_inputs[0])[0] # Needed for reshaping. 
- attn_length = attention_states.get_shape()[1].value - if attn_length is None: - attn_length = array_ops.shape(attention_states)[1] - attn_size = attention_states.get_shape()[2].value - - # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before. - hidden = array_ops.reshape(attention_states, - [-1, attn_length, 1, attn_size]) - hidden_features = [] - v = [] - attention_vec_size = attn_size # Size of query vectors for attention. - for a in xrange(num_heads): - k = variable_scope.get_variable("AttnW_%d" % a, - [1, 1, attn_size, attention_vec_size]) - hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")) - v.append( - variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size])) - - state = initial_state - - def attention(query): - """Put attention masks on hidden using hidden_features and query.""" - ds = [] # Results of attention reads will be stored here. - if nest.is_sequence(query): # If the query is a tuple, flatten it. - query_list = nest.flatten(query) - for q in query_list: # Check that ndims == 2 if specified. - ndims = q.get_shape().ndims - if ndims: - assert ndims == 2 - query = array_ops.concat(query_list, 1) - for a in xrange(num_heads): - with variable_scope.variable_scope("Attention_%d" % a): - y = linear(query, attention_vec_size, True) - y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) - # Attention mask is a softmax of v^T * tanh(...). - s = math_ops.reduce_sum(v[a] * math_ops.tanh(hidden_features[a] + y), - [2, 3]) - a = nn_ops.softmax(s) - # Now calculate the attention-weighted vector d. - d = math_ops.reduce_sum( - array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2]) - ds.append(array_ops.reshape(d, [-1, attn_size])) - return ds - - outputs = [] - prev = None - batch_attn_size = array_ops.stack([batch_size, attn_size]) - attns = [ - array_ops.zeros( - batch_attn_size, dtype=dtype) for _ in xrange(num_heads) - ] - for a in attns: # Ensure the second shape of attention vectors is set. - a.set_shape([None, attn_size]) - if initial_state_attention: - attns = attention(initial_state) - for i, inp in enumerate(decoder_inputs): - if i > 0: - variable_scope.get_variable_scope().reuse_variables() - # If loop_function is set, we use it instead of decoder_inputs. - if loop_function is not None and prev is not None: - with variable_scope.variable_scope("loop_function", reuse=True): - inp = loop_function(prev, i) - # Merge input and previous attentions into one vector of the right size. - input_size = inp.get_shape().with_rank(2)[1] - if input_size.value is None: - raise ValueError("Could not infer input size from input: %s" % inp.name) - x = linear([inp] + attns, input_size, True) - # Run the RNN. - cell_output, state = cell(x, state) - # Run the attention mechanism. 
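For illustration, the attention masks computed above reduce, for a single head, to score_i = v^T tanh(W h_i + U q), a softmax over encoder steps, and a weighted sum of the encoder outputs. Below is a minimal NumPy sketch of that computation; the names W, U, v and the random shapes are placeholder assumptions, not the patch's variables.

import numpy as np

def additive_attention(query, attention_states, W, U, v):
    # query: [attn_size], attention_states: [attn_length, attn_size]
    scores = np.tanh(attention_states @ W.T + query @ U.T) @ v   # [attn_length]
    weights = np.exp(scores - scores.max())
    weights /= weights.sum()                      # softmax over encoder positions
    context = weights @ attention_states          # attention-weighted vector d
    return context, weights

rng = np.random.default_rng(0)
attn_length, attn_size = 5, 8
h = rng.normal(size=(attn_length, attn_size))     # encoder outputs (attention_states)
q = rng.normal(size=attn_size)                    # decoder cell state (query)
W = rng.normal(size=(attn_size, attn_size))
U = rng.normal(size=(attn_size, attn_size))
v = rng.normal(size=attn_size)
context, weights = additive_attention(q, h, W, U, v)
print(weights.round(3), context.shape)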
- if i == 0 and initial_state_attention: - with variable_scope.variable_scope( - variable_scope.get_variable_scope(), reuse=True): - attns = attention(state) - else: - attns = attention(state) - - with variable_scope.variable_scope("AttnOutputProjection"): - output = linear([cell_output] + attns, output_size, True) - if loop_function is not None: - prev = output - outputs.append(output) - - return outputs, state diff --git a/tf_chatbot/lib/chat.py b/tf_chatbot/lib/chat.py index 094737e..18a5c65 100644 --- a/tf_chatbot/lib/chat.py +++ b/tf_chatbot/lib/chat.py @@ -7,13 +7,17 @@ from tf_chatbot.lib import data_utils from tf_chatbot.lib.seq2seq_model_utils import create_model, get_predicted_sentence + def chat(): with tf.Session() as sess: model = create_model(sess, forward_only=True) model.batch_size = 1 - vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.in" % FLAGS.vocab_size) + vocab_path = os.path.join( + FLAGS.data_dir, + "vocab%d.in" % + FLAGS.vocab_size) vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path) sys.stdout.write("> ") @@ -21,8 +25,9 @@ def chat(): sentence = sys.stdin.readline() while sentence: - predicted_sentence = get_predicted_sentence(sentence, vocab, rev_vocab, model, sess) + predicted_sentence = get_predicted_sentence( + sentence, vocab, rev_vocab, model, sess) print(predicted_sentence) print("> ") sys.stdout.flush() - sentence = sys.stdin.readline() \ No newline at end of file + sentence = sys.stdin.readline() diff --git a/tf_chatbot/lib/data_utils.py b/tf_chatbot/lib/data_utils.py index cb976ee..e823e33 100644 --- a/tf_chatbot/lib/data_utils.py +++ b/tf_chatbot/lib/data_utils.py @@ -28,22 +28,28 @@ _ENCODING = "utf8" + def get_dialog_train_set_path(path): return os.path.join(path, 'train_data') + def get_dialog_dev_set_path(path): return os.path.join(path, 'dev_data') + def basic_tokenizer(sentence): words = [] for space_separated_fragment in sentence.strip().split(): words.extend(re.split(_WORD_SPLIT, space_separated_fragment)) return [w.lower() for w in words if w] + def create_vocabulary_bak(vocabulary_path, data_path, max_vocabulary_size, - tokenizer=None, normalize_digits=True): + tokenizer=None, normalize_digits=True): if not gfile.Exists(vocabulary_path): - print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path)) + print( + "Creating vocabulary %s from data %s" % + (vocabulary_path, data_path)) vocab = {} with gfile.GFile(data_path, mode='r') as f: counter = 0 @@ -51,28 +57,33 @@ def create_vocabulary_bak(vocabulary_path, data_path, max_vocabulary_size, counter += 1 if counter % 100000 == 0: print(" processing line %d" % counter) - tokens = tokenizer(line) if tokenizer else basic_tokenizer(line) + tokens = tokenizer( + line) if tokenizer else basic_tokenizer(line) for w in tokens: word = re.sub(_DIGIT_RE, "0", w) if normalize_digits else w if word in vocab: vocab[word] += 1 else: vocab[word] = 1 - vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True) + vocab_list = _START_VOCAB + \ + sorted(vocab, key=vocab.get, reverse=True) if len(vocab_list) > max_vocabulary_size: vocab_list = vocab_list[:max_vocabulary_size] with gfile.GFile(vocabulary_path, mode='w') as vocab_file: for w in vocab_list: vocab_file.write(w + '\n') + def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size, - tokenizer=None, normalize_digits=True): + tokenizer=None, normalize_digits=True): if not gfile.Exists(vocabulary_path): - print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path)) + print( + "Creating 
vocabulary %s from data %s" % + (vocabulary_path, data_path)) vocab = {} data = json.load(open(data_path), encoding=_ENCODING) counter = 0 - for ((q,qe),(a,ae)) in data: + for ((q, qe), (a, ae)) in data: counter += 1 if counter % 50000 == 0: print(" Create_vocabulary: processing line %d" % counter) @@ -110,11 +121,13 @@ def initialize_vocabulary(vocabulary_path): rev_vocab.extend(f.readlines()) rev_vocab = [line.strip() for line in rev_vocab] - vocab = dict([(x,y) for (y,x) in enumerate(rev_vocab)]) # {'word':index} + vocab = dict([(x, y) + for (y, x) in enumerate(rev_vocab)]) # {'word':index} return vocab, rev_vocab else: raise ValueError("Vocabulary file %s not found" % vocabulary_path) + def sentence_to_token_ids(sentence, vocabulary, tokenizer=None, normalize_digits=True): if tokenizer: @@ -123,10 +136,17 @@ def sentence_to_token_ids(sentence, vocabulary, words = basic_tokenizer(sentence) if not normalize_digits: return [vocabulary.get(w.encode('utf8'), UNK_ID) for w in words] - return [vocabulary.get(re.sub(_DIGIT_RE, "0", w.encode('utf8')), UNK_ID) for w in words] + return [ + vocabulary.get( + re.sub( + _DIGIT_RE, + "0", + w.encode('utf8')), + UNK_ID) for w in words] + def data_to_token_ids_bak(data_path, target_path, vocabulary_path, - tokenizer=None, normalize_digits=True): + tokenizer=None, normalize_digits=True): if not gfile.Exists(target_path): print("Tokenizing data in %s" % data_path) vocab, _ = initialize_vocabulary(vocabulary_path) @@ -139,31 +159,38 @@ def data_to_token_ids_bak(data_path, target_path, vocabulary_path, print(" tokenizing line %d" % counter) token_ids = sentence_to_token_ids(line, vocab, tokenizer, normalize_digits) - tokens_file.write(" ".join([str(tok) for tok in token_ids]) + '\n') + tokens_file.write( + " ".join([str(tok) for tok in token_ids]) + '\n') + def data_to_token_ids(data_path, target_path, vocabulary_path, - tokenizer=None, normalize_digits=True): + tokenizer=None, normalize_digits=True): if not gfile.Exists(target_path): print("Tokenizing data in %s" % data_path) vocab, _ = initialize_vocabulary(vocabulary_path) with gfile.GFile(target_path, mode='w') as tokens_file: data = json.load(open(data_path), encoding=_ENCODING) counter = 0 - for ((q,qe),(a,ae)) in data: + for ((q, qe), (a, ae)) in data: counter += 1 if counter % 50000 == 0: print(" Data_to_token_ids: tokenizing line %d" % counter) - token_ids_q = sentence_to_token_ids(q, vocab, tokenizer, normalize_digits) - tokens_file.write(" ".join([str(tok) for tok in token_ids_q]) + '\n') - token_ids_a = sentence_to_token_ids(a, vocab, tokenizer, normalize_digits) - tokens_file.write(" ".join([str(tok) for tok in token_ids_a]) + '\n') + token_ids_q = sentence_to_token_ids( + q, vocab, tokenizer, normalize_digits) + tokens_file.write(" ".join([str(tok) + for tok in token_ids_q]) + '\n') + token_ids_a = sentence_to_token_ids( + a, vocab, tokenizer, normalize_digits) + tokens_file.write(" ".join([str(tok) + for tok in token_ids_a]) + '\n') + def prepare_dialog_data(data_dir, vocabulary_size): train_path = get_dialog_train_set_path(data_dir) dev_path = get_dialog_dev_set_path(data_dir) vocab_path = os.path.join(data_dir, "vocab%d.in" % vocabulary_size) - create_vocabulary(vocab_path, train_path+".json", vocabulary_size) + create_vocabulary(vocab_path, train_path + ".json", vocabulary_size) train_ids_path = train_path + (".ids%d.in" % vocabulary_size) data_to_token_ids(train_path + ".json", train_ids_path, vocab_path) @@ -173,6 +200,7 @@ def prepare_dialog_data(data_dir, vocabulary_size): return 
(train_ids_path, dev_ids_path, vocab_path) + def read_data(tokenized_dialog_path, max_size=None): data_set = [[] for _ in BUCKETS] @@ -184,16 +212,16 @@ def read_data(tokenized_dialog_path, max_size=None): counter += 1 if counter % 100000 == 0: print(" reading data line %d" % counter) - #sys.stdout.flush() + # sys.stdout.flush() source_ids = [int(x) for x in source.split()] target_ids = [int(x) for x in target.split()] target_ids.append(EOS_ID) for bucket_id, (source_size, target_size) in enumerate(BUCKETS): - if len(source_ids) < source_size and len(target_ids) < target_size: + if len(source_ids) < source_size and len( + target_ids) < target_size: data_set[bucket_id].append([source_ids, target_ids]) break source, target = fh.readline(), fh.readline() return data_set - diff --git a/tf_chatbot/lib/predict.py b/tf_chatbot/lib/predict.py index 00b23f7..6f61c99 100644 --- a/tf_chatbot/lib/predict.py +++ b/tf_chatbot/lib/predict.py @@ -7,13 +7,15 @@ from tf_chatbot.lib.seq2seq_model_utils import create_model, get_predicted_sentence import json + def predict(): def _get_test_dataset(): data = json.load(open(TEST_DATASET_PATH)) test_sentences = [q for ((q, qe), _) in data] return test_sentences - results_filename = '_'.join(['results', str(FLAGS.num_layers), str(FLAGS.size), str(FLAGS.vocab_size)]) + results_filename = '_'.join( + ['results', str(FLAGS.num_layers), str(FLAGS.size), str(FLAGS.vocab_size)]) results_path = os.path.join(FLAGS.results_dir, results_filename) with tf.Session() as sess, open(results_path, 'w') as results_fh: @@ -21,14 +23,18 @@ def _get_test_dataset(): model = create_model(sess, forward_only=True) model.batch_size = 1 - vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.in" % FLAGS.vocab_size) + vocab_path = os.path.join( + FLAGS.data_dir, + "vocab%d.in" % + FLAGS.vocab_size) vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path) test_dataset = _get_test_dataset() for sentence in test_dataset: - predicted_sentence = get_predicted_sentence(sentence, vocab, rev_vocab, model, sess, use_beam_search=True) + predicted_sentence = get_predicted_sentence( + sentence, vocab, rev_vocab, model, sess, use_beam_search=True) print(sentence, '->') print(predicted_sentence) - results_fh.write(predicted_sentence + '\n') \ No newline at end of file + results_fh.write(predicted_sentence + '\n') diff --git a/tf_chatbot/lib/seq2seq_model.py b/tf_chatbot/lib/seq2seq_model.py index 61ecde3..dce3fbc 100644 --- a/tf_chatbot/lib/seq2seq_model.py +++ b/tf_chatbot/lib/seq2seq_model.py @@ -5,29 +5,74 @@ import random import numpy as np -# from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import xrange # pylint: disable=redefined-builtin import tensorflow as tf import tf_chatbot.lib.data_utils as data_utils -from tensorflow.contrib.legacy_seq2seq import model_with_buckets, sequence_loss, embedding_attention_decoder -from tf_chatbot.lib.basic.advanced_seq2seq import embedding_attention_sampled_seq2seq +from tensorflow.contrib.legacy_seq2seq import sequence_loss, attention_decoder from tensorflow.contrib.rnn import GRUCell, BasicLSTMCell, MultiRNNCell, EmbeddingWrapper, static_rnn, OutputProjectionWrapper -class Seq2SeqModel(object): - """Sequence-to-sequence model with attention and for multiple buckets. - This class implements a multi-layer recurrent neural network as encoder, - and an attention-based decoder. 
This is the same as the model described in - this paper: http://arxiv.org/abs/1412.7449 - please look there for details, - or into the seq2seq library for complete model implementation. - This class also allows to use GRU cells in addition to LSTM cells, and - sampled softmax to handle large output vocabulary size. A single-layer - version of this model, but with bi-directional encoder, was presented in - http://arxiv.org/abs/1409.0473 - and sampled softmax is described in Section 3 of the following paper. - http://arxiv.org/abs/1412.2007 - """ +def _extract_sample_and_embed(embedding, + output_projection=None, + update_embedding=True): + + def loop_function(prev, _): + if output_projection is not None: + prev = tf.nn.xw_plus_b( + prev, output_projection[0], output_projection[1]) + prev_symbol = tf.squeeze(tf.multinomial(prev, 1), axis=1) + emb_prev = tf.nn.embedding_lookup(embedding, prev_symbol) + if not update_embedding: + emb_prev = tf.stop_gradient(emb_prev) + return emb_prev + + return loop_function + + +def embedding_attention_decoder(decoder_inputs, + initial_state, + attention_states, + cell, + num_symbols, + embedding_size, + num_heads=1, + output_size=None, + output_projection=None, + feed_previous=False, + update_embedding_for_previous=True, + dtype=None, + scope=None, + initial_state_attention=False): + if output_size is None: + output_size = cell.output_size + if output_projection is not None: + proj_biases = tf.convert_to_tensor(output_projection[1], dtype=dtype) + proj_biases.get_shape().assert_is_compatible_with([num_symbols]) + + with tf.variable_scope("embedding_attention_decoder", dtype=dtype): + embedding = tf.get_variable("embedding", [num_symbols, embedding_size]) + + loop_function = _extract_sample_and_embed( + embedding, output_projection, + update_embedding_for_previous) if feed_previous else None + emb_inp = [ + tf.nn.embedding_lookup( + embedding, + i) for i in decoder_inputs] + + return attention_decoder( + emb_inp, + initial_state, + attention_states, + cell, + output_size=output_size, + num_heads=num_heads, + loop_function=loop_function, + initial_state_attention=initial_state_attention) + +class Seq2SeqModel(object): def __init__(self, source_vocab_size, target_vocab_size, @@ -43,28 +88,7 @@ def __init__(self, forward_only=False, beam_search_size=1, dtype=tf.float32): - """Create the model. - Args: - source_vocab_size: size of the source vocabulary. - target_vocab_size: size of the target vocabulary. - buckets: a list of pairs (I, O), where I specifies maximum input length - that will be processed in that bucket, and O specifies maximum output - length. Training instances that have inputs longer than I or outputs - longer than O will be pushed to the next bucket and padded accordingly. - We assume that the list is sorted, e.g., [(2, 4), (8, 16)]. - size: number of units in each layer of the model. - num_layers: number of layers in the model. - max_gradient_norm: gradients will be clipped to maximally this norm. - batch_size: the size of the batches used during training; - the model construction is independent of batch_size, so it can be - changed after initialization if this is convenient, e.g., for decoding. - learning_rate: learning rate to start with. - learning_rate_decay_factor: decay learning rate by this much when needed. - use_lstm: if true, we use LSTM cells instead of GRU cells. - num_samples: number of samples for sampled softmax. - forward_only: if set, we do not construct the backward pass in the model. 
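To make the bucket mechanics concrete, here is a small sketch of how a (source, target) pair is routed to the smallest fitting bucket and then padded, in the spirit of read_data and get_batch elsewhere in this patch series; the BUCKETS values and token ids below are assumed examples, not the project's actual configuration.

PAD_ID, GO_ID, EOS_ID = 0, 1, 2                     # assumed special-symbol ids
BUCKETS = [(5, 10), (10, 15), (20, 25), (40, 50)]   # assumed (encoder len, decoder len) pairs

def assign_bucket(source_ids, target_ids):
    # Pick the first bucket both sequences fit into (target already ends with EOS).
    for bucket_id, (src_size, tgt_size) in enumerate(BUCKETS):
        if len(source_ids) < src_size and len(target_ids) < tgt_size:
            return bucket_id
    return None   # longer than every bucket; such pairs are simply skipped

def pad_pair(source_ids, target_ids, bucket_id):
    # Encoder input is padded then reversed; decoder input gets a GO prefix and PAD suffix.
    src_size, tgt_size = BUCKETS[bucket_id]
    encoder = list(reversed(source_ids + [PAD_ID] * (src_size - len(source_ids))))
    decoder = [GO_ID] + target_ids + [PAD_ID] * (tgt_size - len(target_ids) - 1)
    return encoder, decoder

source, target = [4, 8, 15], [16, 23, EOS_ID]
bucket = assign_bucket(source, target)
print(bucket, pad_pair(source, target, bucket))
# 0 ([0, 0, 15, 8, 4], [1, 16, 23, 2, 0, 0, 0, 0, 0, 0])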
- dtype: the data type to use to store internal variables. - """ + self.source_vocab_size = source_vocab_size self.target_vocab_size = target_vocab_size self.buckets = buckets @@ -79,11 +103,16 @@ def __init__(self, # If we use sampled softmax, we need an output projection. output_projection = None softmax_loss_function = None - # Sampled softmax only makes sense if we sample less than vocabulary size. + # Sampled softmax only makes sense if we sample less than vocabulary + # size. if num_samples > 0 and num_samples < self.target_vocab_size: - w_t = tf.get_variable("proj_w", [self.target_vocab_size, size], dtype=dtype) + w_t = tf.get_variable( + "proj_w", [ + self.target_vocab_size, size], dtype=dtype) w = tf.transpose(w_t) - b = tf.get_variable("proj_b", [self.target_vocab_size], dtype=dtype) + b = tf.get_variable( + "proj_b", [ + self.target_vocab_size], dtype=dtype) output_projection = (w, b) def sampled_loss(labels, logits): @@ -124,18 +153,19 @@ def single_cell(): # The seq2seq function: we use embedding for the input and attention. def seq2seq_f(encoder_inputs, decoder_inputs, do_decode, bucket_id): - def embedding_attention_sampled_seq2seq(encoder_inputs, - decoder_inputs, - cell, - num_encoder_symbols, - num_decoder_symbols, - embedding_size, - bucket_index, - num_heads=1, - output_projection=None, - feed_previous=False, - initial_state_attention=False, - dtype=tf.float32): + def embedding_attention_sampled_seq2seq( + encoder_inputs, + decoder_inputs, + cell, + num_encoder_symbols, + num_decoder_symbols, + embedding_size, + bucket_index, + num_heads=1, + output_projection=None, + feed_previous=False, + initial_state_attention=False, + dtype=tf.float32): with tf.variable_scope("embedding_attention_sampled_seq2seq"): encoder_cell = EmbeddingWrapper( cell, @@ -145,9 +175,8 @@ def embedding_attention_sampled_seq2seq(encoder_inputs, encoder_outputs, encoder_state = static_rnn( encoder_cell, encoder_inputs, dtype=dtype) - top_states = [ - tf.reshape(e, [-1,1, cell.output_size]) for e in encoder_outputs - ] + top_states = [tf.reshape( + e, [-1, 1, cell.output_size]) for e in encoder_outputs] attention_states = tf.concat(top_states, 1) self.model_encoder_states[bucket_index] = encoder_state @@ -155,14 +184,17 @@ def embedding_attention_sampled_seq2seq(encoder_inputs, output_size = None if output_projection is None: - cell = OutputProjectionWrapper(cell, num_decoder_symbols) + cell = OutputProjectionWrapper( + cell, num_decoder_symbols) output_size = num_decoder_symbols if isinstance(feed_previous, bool): - return embedding_attention_decoder( + return embedding_attention_decoder( decoder_inputs, - self.model_encoder_states[bucket_index], #encoder_state, - self.model_attention_states[bucket_index], #attention_states, + # encoder_state, + self.model_encoder_states[bucket_index], + # attention_states, + self.model_attention_states[bucket_index], cell, num_decoder_symbols, embedding_size, @@ -191,13 +223,22 @@ def embedding_attention_sampled_seq2seq(encoder_inputs, self.decoder_inputs = [] self.target_weights = [] for i in range(buckets[-1][0]): # Last bucket is the biggest one. 
- self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None], - name="encoder{0}".format(i))) + self.encoder_inputs.append( + tf.placeholder( + tf.int32, + shape=[None], + name="encoder{0}".format(i))) for i in range(buckets[-1][1] + 1): - self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None], - name="decoder{0}".format(i))) - self.target_weights.append(tf.placeholder(dtype, shape=[None], - name="weight{0}".format(i))) + self.decoder_inputs.append( + tf.placeholder( + tf.int32, + shape=[None], + name="decoder{0}".format(i))) + self.target_weights.append( + tf.placeholder( + dtype, + shape=[None], + name="weight{0}".format(i))) # Our targets are decoder inputs shifted by one. targets = [self.decoder_inputs[i + 1] @@ -211,14 +252,14 @@ def embedding_attention_sampled_seq2seq(encoder_inputs, with tf.variable_scope(tf.get_variable_scope(), reuse=True if bucket_idx > 0 else None): if forward_only: bucket_outputs, bucket_outputs_state = seq2seq_f(self.encoder_inputs[:bucket[0]], - self.decoder_inputs[:bucket[1]], - True, - bucket_idx) + self.decoder_inputs[:bucket[1]], + True, + bucket_idx) else: bucket_outputs, bucket_outputs_state = seq2seq_f(self.encoder_inputs[:bucket[0]], - self.decoder_inputs[:bucket[1]], - False, - bucket_idx) + self.decoder_inputs[:bucket[1]], + False, + bucket_idx) self.outputs.append(bucket_outputs) self.decoder_out_state.append(bucket_outputs_state) self.losses.append( @@ -237,7 +278,8 @@ def embedding_attention_sampled_seq2seq(encoder_inputs, for output in self.outputs[b]] #best_outputs = [tf.argmax(x,1) for x in self.outputs[b]] #best_outputs = tf.concat(axis=1, values=[tf.reshape(x, [self.batch_size, 1]) for x in best_outputs]) - _topk_log_probs, _topk_ids = tf.nn.top_k(tf.nn.softmax(self.outputs[b][-1]), beam_search_size) + _topk_log_probs, _topk_ids = tf.nn.top_k( + tf.nn.softmax(self.outputs[b][-1]), beam_search_size) self.topk_probs.append(_topk_log_probs) self.topk_ids.append(_topk_ids) @@ -249,26 +291,39 @@ def embedding_attention_sampled_seq2seq(encoder_inputs, opt = tf.train.GradientDescentOptimizer(self.learning_rate) for b in range(len(buckets)): gradients = tf.gradients(self.losses[b], params) - clipped_gradients, norm = tf.clip_by_global_norm(gradients, - max_gradient_norm) + clipped_gradients, norm = tf.clip_by_global_norm( + gradients, max_gradient_norm) self.gradient_norms.append(norm) self.updates.append(opt.apply_gradients( zip(clipped_gradients, params), global_step=self.global_step)) self.saver = tf.train.Saver(tf.global_variables()) - def step_beam_search(self, session, encoder_inputs, decoder_inputs, target_weights, - bucket_id, forward_only, use_beam_search=True): + def step( + self, + session, + encoder_inputs, + decoder_inputs, + target_weights, + bucket_id, + forward_only, + use_beam_search=False): encoder_size, decoder_size = self.buckets[bucket_id] if len(encoder_inputs) != encoder_size: - raise ValueError("Encoder length must be equal to the one in bucket," - " %d != %d." % (len(encoder_inputs), encoder_size)) + raise ValueError( + "Encoder length must be equal to the one in bucket," + " %d != %d." % + (len(encoder_inputs), encoder_size)) if len(decoder_inputs) != decoder_size: - raise ValueError("Decoder length must be equal to the one in bucket," - " %d != %d." % (len(decoder_inputs), decoder_size)) + raise ValueError( + "Decoder length must be equal to the one in bucket," + " %d != %d." 
% + (len(decoder_inputs), decoder_size)) if len(target_weights) != decoder_size: - raise ValueError("Weights length must be equal to the one in bucket," - " %d != %d." % (len(target_weights), decoder_size)) + raise ValueError( + "Weights length must be equal to the one in bucket," + " %d != %d." % + (len(target_weights), decoder_size)) input_feed = {} for l in range(encoder_size): @@ -281,42 +336,65 @@ def step_beam_search(self, session, encoder_inputs, decoder_inputs, target_weigh input_feed[last_target] = np.zeros([self.batch_size], dtype=np.int32) if not forward_only: - raise NotImplementedError("Not Implemented!!!") + output_feed = [self.updates[bucket_id], # Update Op that does SGD. + self.gradient_norms[bucket_id], # Gradient norm. + self.losses[bucket_id]] # Loss for this batch. + outputs = session.run(output_feed, input_feed) + + # Gradient norm, loss, no outputs. + return outputs[1], outputs[2], None else: if use_beam_search: output_feed = [self.model_attention_states[bucket_id], self.model_encoder_states[bucket_id]] - outputs = session.run(output_feed, input_feed) # attention_state, encoder_state - beams = [(0.0, [data_utils.GO_ID], data_utils.GO_ID, outputs[1])] * 3 # score, result, last_token, encoder_state + # attention_state, encoder_state + outputs = session.run(output_feed, input_feed) + # score, result, last_token, encoder_state + beams = [(0.0, + [data_utils.GO_ID], + data_utils.GO_ID, + outputs[1])] * 3 result = [] step = 0 attention_state = outputs[0] - while step < decoder_size and len(result) < self.beam_search_size: + while step < decoder_size and len( + result) < self.beam_search_size: step += 1 _last_tokens = [beam_[2] for beam_ in beams] _encoder_state = [beam_[3] for beam_ in beams] - output_feed = [self.topk_ids[bucket_id], self.topk_probs[bucket_id], self.decoder_out_state[bucket_id]] + output_feed = [ + self.topk_ids[bucket_id], + self.topk_probs[bucket_id], + self.decoder_out_state[bucket_id]] input_feed = {} input_feed[self.model_attention_states[bucket_id].name] = attention_state - input_feed[self.model_encoder_states[bucket_id].name] = np.squeeze(np.array(_encoder_state)) + input_feed[self.model_encoder_states[bucket_id].name] = np.squeeze( + np.array(_encoder_state)) for l in range(step): _decoder_inputs = [beam_[1][l] for beam_ in beams] input_feed[self.decoder_inputs[l].name] = _decoder_inputs - _tok_ids, _tok_probs, _out_states = session.run(output_feed, input_feed) + _tok_ids, _tok_probs, _out_states = session.run( + output_feed, input_feed) new_beams = [] for beam_idx in range(self.beam_search_size): for _idx in range(self.beam_search_size): - new_beams.append((beams[beam_idx][0]+_tok_probs[beam_idx][_idx], beams[beam_idx][1]+[_tok_ids[beam_idx][_idx]], _tok_ids[beam_idx][_idx], _out_states[beam_idx])) + new_beams.append( + (beams[beam_idx][0] + _tok_probs[beam_idx][_idx], + beams[beam_idx][1] + [ + _tok_ids[beam_idx][_idx]], + _tok_ids[beam_idx][_idx], + _out_states[beam_idx])) new_beams.sort(key=lambda x: x[0], reverse=True) beams = [] for beam_ in new_beams: if beam_[2] == data_utils.EOS_ID: - result.append((beam_[0],beam_[1][:-1],beam_[2],beam_[3])) + result.append( + (beam_[0], beam_[1][:-1], beam_[2], beam_[3])) else: beams.append(beam_) if len(beams) == self.beam_search_size: @@ -327,50 +405,17 @@ def step_beam_search(self, session, encoder_inputs, decoder_inputs, target_weigh result.append(beam_) if len(result) == self.beam_search_size: break - return None, None, result[0][1] - - else: - raise NotImplementedError("Not Implemented!!!") - - - 
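The control flow of the beam-search branch above — beams kept as (score, tokens, last_token, state) tuples, each expanded with its top-k successors, and hypotheses moved to the result list once they emit EOS — can be summarized with a self-contained toy version. The scoring callable and token ids here are made-up stand-ins for illustration, not the model's outputs.

import math

GO_ID, EOS_ID = 1, 2   # placeholder ids mirroring the data_utils conventions

def beam_search(next_token_log_probs, beam_size, max_len):
    beams = [(0.0, [GO_ID])]            # (cumulative log-prob, token sequence)
    finished = []
    for _ in range(max_len):
        candidates = []
        for score, tokens in beams:
            for tok, logp in next_token_log_probs(tokens)[:beam_size]:
                candidates.append((score + logp, tokens + [tok]))
        candidates.sort(key=lambda c: c[0], reverse=True)
        beams = []
        for score, tokens in candidates:
            if tokens[-1] == EOS_ID:
                finished.append((score, tokens[:-1]))   # drop EOS, as the patch does
            else:
                beams.append((score, tokens))
            if len(beams) == beam_size:
                break
        if len(finished) >= beam_size or not beams:
            break
    finished.extend(beams)                              # fall back to unfinished beams
    finished.sort(key=lambda c: c[0], reverse=True)
    return finished[0][1]

def fake_log_probs(tokens):
    # Toy distribution: prefer token 3 until three symbols are emitted, then prefer EOS.
    if len(tokens) < 3:
        return [(3, math.log(0.7)), (4, math.log(0.2)), (EOS_ID, math.log(0.1))]
    return [(EOS_ID, math.log(0.8)), (3, math.log(0.15)), (4, math.log(0.05))]

print(beam_search(fake_log_probs, beam_size=3, max_len=5))   # e.g. [1, 3, 3]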
def step(self, session, encoder_inputs, decoder_inputs, target_weights, - bucket_id, forward_only): - encoder_size, decoder_size = self.buckets[bucket_id] - if len(encoder_inputs) != encoder_size: - raise ValueError("Encoder length must be equal to the one in bucket," - " %d != %d." % (len(encoder_inputs), encoder_size)) - if len(decoder_inputs) != decoder_size: - raise ValueError("Decoder length must be equal to the one in bucket," - " %d != %d." % (len(decoder_inputs), decoder_size)) - if len(target_weights) != decoder_size: - raise ValueError("Weights length must be equal to the one in bucket," - " %d != %d." % (len(target_weights), decoder_size)) - - input_feed = {} - for l in range(encoder_size): - input_feed[self.encoder_inputs[l].name] = encoder_inputs[l] - for l in range(decoder_size): - input_feed[self.decoder_inputs[l].name] = decoder_inputs[l] - input_feed[self.target_weights[l].name] = target_weights[l] - last_target = self.decoder_inputs[decoder_size].name - input_feed[last_target] = np.zeros([self.batch_size], dtype=np.int32) + outputs = result[0] + return None, None, outputs[1] - if not forward_only: - output_feed = [self.updates[bucket_id], # Update Op that does SGD. - self.gradient_norms[bucket_id], # Gradient norm. - self.losses[bucket_id]] # Loss for this batch. - else: - output_feed = [self.losses[bucket_id]] # Loss for this batch. - for l in range(decoder_size): # Output logits. - output_feed.append(self.outputs[bucket_id][l]) - - outputs = session.run(output_feed, input_feed) + else: + output_feed = [self.losses[bucket_id]] # Loss for this batch. + for l in range(decoder_size): # Output logits. + output_feed.append(self.outputs[bucket_id][l]) - if not forward_only: - return outputs[1], outputs[2], None # Gradient norm, loss, no outputs. - else: - return None, outputs[0], outputs[1:] # No gradient norm, loss, outputs. + outputs = session.run(output_feed, input_feed) + return None, outputs[0], outputs[1:] def get_batch(self, data, bucket_id): @@ -380,7 +425,8 @@ def get_batch(self, data, bucket_id): for _ in range(self.batch_size): encoder_input, decoder_input = random.choice(data[bucket_id]) - encoder_pad = [data_utils.PAD_ID] * (encoder_size - len(encoder_input)) + encoder_pad = [data_utils.PAD_ID] * \ + (encoder_size - len(encoder_input)) encoder_inputs.append(list(reversed(encoder_input + encoder_pad))) decoder_pad_size = decoder_size - len(decoder_input) - 1 diff --git a/tf_chatbot/lib/seq2seq_model_utils.py b/tf_chatbot/lib/seq2seq_model_utils.py index 79e8659..10f9bb8 100644 --- a/tf_chatbot/lib/seq2seq_model_utils.py +++ b/tf_chatbot/lib/seq2seq_model_utils.py @@ -12,6 +12,7 @@ _INDEX = ".index" + def create_model(session, forward_only): model = seq2seq_model.Seq2SeqModel( source_vocab_size=FLAGS.vocab_size, @@ -33,27 +34,41 @@ def create_model(session, forward_only): model.saver.restore(session, ckpt.model_checkpoint_path) else: if ckpt: - print("Unable to reach checkpoint file %s." % ckpt.model_checkpoint_path) + print( + "Unable to reach checkpoint file %s." 
% + ckpt.model_checkpoint_path) print("Create model with fresh parameters") session.run(tf.global_variables_initializer()) return model -def get_predicted_sentence(input_sentence, vocab, rev_vocab, model, sess, use_beam_search=False): + +def get_predicted_sentence( + input_sentence, + vocab, + rev_vocab, + model, + sess, + use_beam_search=False): input_token_ids = data_utils.sentence_to_token_ids(input_sentence, vocab) - bucket_id = min([b for b in range(len(BUCKETS)) if BUCKETS[b][0] > len(input_token_ids)]) + bucket_id = min([b for b in range(len(BUCKETS)) + if BUCKETS[b][0] > len(input_token_ids)]) outputs = [] feed_data = {bucket_id: [(input_token_ids, outputs)]} - encoder_inputs, decoder_inputs, target_weights = model.get_batch(feed_data, bucket_id) + encoder_inputs, decoder_inputs, target_weights = model.get_batch( + feed_data, bucket_id) if use_beam_search: - _, _, output_words = model.step_beam_search(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only=True) + _, _, output_words = model.step( + sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only=True, use_beam_search=True) outputs = output_words[1:] - output_sentence = ' '.join([rev_vocab[token_id] for token_id in outputs]) + output_sentence = ' '.join([rev_vocab[token_id] + for token_id in outputs]) else: - _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only=True) + _, _, output_logits = model.step( + sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only=True) outputs = [] for logit in output_logits: selected_token_id = int(np.argmax(logit, axis=1)) @@ -63,4 +78,4 @@ def get_predicted_sentence(input_sentence, vocab, rev_vocab, model, sess, use_be outputs.append(selected_token_id) output_sentence = ' '.join([rev_vocab[output] for output in outputs]) - return output_sentence \ No newline at end of file + return output_sentence diff --git a/tf_chatbot/lib/train.py b/tf_chatbot/lib/train.py index d99c913..ab14f7f 100644 --- a/tf_chatbot/lib/train.py +++ b/tf_chatbot/lib/train.py @@ -11,21 +11,27 @@ from tf_chatbot.lib.data_utils import read_data from tf_chatbot.lib import data_utils + def train(): print("Preparing dialog data in %s" % FLAGS.data_dir) - train_data, dev_data, _ = data_utils.prepare_dialog_data(FLAGS.data_dir, FLAGS.vocab_size) + train_data, dev_data, _ = data_utils.prepare_dialog_data( + FLAGS.data_dir, FLAGS.vocab_size) with tf.Session() as sess: - print ("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size)) + print ( + "Creating %d layers of %d units." % + (FLAGS.num_layers, FLAGS.size)) model = create_model(sess, forward_only=False) - print ("Reading development and training data (limit:%d)." % FLAGS.max_train_data_size) + print ( + "Reading development and training data (limit:%d)." 
% + FLAGS.max_train_data_size) dev_set = read_data(dev_data) train_set = read_data(train_data, FLAGS.max_train_data_size) train_bucket_sizes = [len(train_set[b]) for b in range(len(BUCKETS))] train_total_size = float(sum(train_bucket_sizes)) - train_buckets_scale = [sum(train_bucket_sizes[:i+1]) / train_total_size + train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size for i in range(len(train_bucket_sizes))] step_time, loss = 0.0, 0.0 @@ -44,29 +50,40 @@ def train(): _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only=False) - step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint + step_time += (time.time() - start_time) / \ + FLAGS.steps_per_checkpoint loss += step_loss / FLAGS.steps_per_checkpoint current_step += 1 if current_step % FLAGS.steps_per_checkpoint == 0: perplexity = math.exp(loss) if loss < 300 else float('inf') - print ("global step %d learning rate %.4f step-time %.2f perplexity %.2f" % - (model.global_step.eval(), model.learning_rate.eval(), step_time, perplexity)) + print ( + "global step %d learning rate %.4f step-time %.2f perplexity %.2f" % + (model.global_step.eval(), model.learning_rate.eval(), step_time, perplexity)) - if len(previous_losses) > 2 and loss > max(previous_losses[-3:]): + if len(previous_losses) > 2 and loss > max( + previous_losses[-3:]): sess.run(model.learning_rate_decay_op) previous_losses.append(loss) checkpoint_path = os.path.join(FLAGS.model_dir, 'model.ckpt') - model.saver.save(sess, checkpoint_path, global_step=model.global_step) + model.saver.save( + sess, + checkpoint_path, + global_step=model.global_step) step_time, loss = 0.0, 0.0 for bucket_id in range(len(BUCKETS)): - encoder_inputs, decoder_inputs, target_weights = model.get_batch(dev_set, bucket_id) - _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) - - eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf') - print(" eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx)) - - sys.stdout.flush() \ No newline at end of file + encoder_inputs, decoder_inputs, target_weights = model.get_batch( + dev_set, bucket_id) + _, eval_loss, _ = model.step( + sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) + + eval_ppx = math.exp( + eval_loss) if eval_loss < 300 else float('inf') + print( + " eval: bucket %d perplexity %.2f" % + (bucket_id, eval_ppx)) + + sys.stdout.flush() From 25d6c386822d45f5684299d6b4eb1ad0cedf565a Mon Sep 17 00:00:00 2001 From: RuiCheung Date: Fri, 12 May 2017 16:05:32 +0800 Subject: [PATCH 03/14] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E9=85=8D=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_chatbot/configs/config.py | 2 +- tf_chatbot/lib/train.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tf_chatbot/configs/config.py b/tf_chatbot/configs/config.py index 918ef9a..4f136c7 100644 --- a/tf_chatbot/configs/config.py +++ b/tf_chatbot/configs/config.py @@ -12,7 +12,7 @@ tf.app.flags.DEFINE_float('max_gradient_norm', 5.0, 'Clip gradients to this norm') tf.app.flags.DEFINE_integer('batch_size', 128, 'Batch size to use during training') -tf.app.flags.DEFINE_integer('vocab_size', 1000, 'Dialog vocabulary size') +tf.app.flags.DEFINE_integer('vocab_size', 20000, 'Dialog vocabulary size') tf.app.flags.DEFINE_integer('size', 128, 'size of each model layer') tf.app.flags.DEFINE_integer('num_layers', 1, 'Numbers of layers in the model') 
tf.app.flags.DEFINE_integer('beam_search_size', 3, 'Size of beam search op') diff --git a/tf_chatbot/lib/train.py b/tf_chatbot/lib/train.py index ab14f7f..df6b1db 100644 --- a/tf_chatbot/lib/train.py +++ b/tf_chatbot/lib/train.py @@ -50,8 +50,7 @@ def train(): _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only=False) - step_time += (time.time() - start_time) / \ - FLAGS.steps_per_checkpoint + step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint loss += step_loss / FLAGS.steps_per_checkpoint current_step += 1 From 75bce23f30b9f7a4e8519a203ec66966017caeda Mon Sep 17 00:00:00 2001 From: RuiZhang1993 Date: Fri, 12 May 2017 16:42:34 +0800 Subject: [PATCH 04/14] =?UTF-8?q?=E9=92=88=E5=AF=B9Windows=E7=B3=BB?= =?UTF-8?q?=E7=BB=9F=E4=BF=AE=E6=94=B9=E9=83=A8=E5=88=86=E8=AF=BB=E5=86=99?= =?UTF-8?q?=E6=93=8D=E4=BD=9C=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- legacy_models/main.py | 35 +++++++++++++++++++++++++++++++++++ tf_chatbot/lib/data_utils.py | 19 ++++++++++--------- 2 files changed, 45 insertions(+), 9 deletions(-) create mode 100644 legacy_models/main.py diff --git a/legacy_models/main.py b/legacy_models/main.py new file mode 100644 index 0000000..09f7355 --- /dev/null +++ b/legacy_models/main.py @@ -0,0 +1,35 @@ +# -*- coding:utf8 -*- +import tensorflow as tf + +from legacy_models.model.lstm.model import Config, Model + +# 对训练数据进行切割 +# splitData() + + +config = Config() +#config.is_pretrained = False +model = Model(config) +sess = tf.Session() +model.variables_init(sess) +model.restore(sess, 24000) +model.train(sess) +model.loss_tracker.savefig(config.save_path) + +resonse = model.generate(sess, "我 对此 感到 非常 开心") +print(resonse) + +''' +vocab_to_idx, idx_to_vocab, vocab_embed = loadPretrainedVector(30, 50, "./dict/vector/wiki.zh.text200.vector") + +for k in vocab_to_idx.keys(): + if u"他"==k: + print(k, vocab_to_idx[k]) + +''' + +#for k in idx_to_vocab.keys(): +# print(k, idx_to_vocab[k]) + +#for i in vocab_embed: +# print(i) diff --git a/tf_chatbot/lib/data_utils.py b/tf_chatbot/lib/data_utils.py index e823e33..038c1fc 100644 --- a/tf_chatbot/lib/data_utils.py +++ b/tf_chatbot/lib/data_utils.py @@ -5,6 +5,7 @@ import os import re import sys +import platform import json from tensorflow.python.platform import gfile @@ -134,15 +135,15 @@ def sentence_to_token_ids(sentence, vocabulary, words = tokenizer(sentence) else: words = basic_tokenizer(sentence) - if not normalize_digits: - return [vocabulary.get(w.encode('utf8'), UNK_ID) for w in words] - return [ - vocabulary.get( - re.sub( - _DIGIT_RE, - "0", - w.encode('utf8')), - UNK_ID) for w in words] + if platform.system() == "Windows": + if not normalize_digits: + return [vocabulary.get(w, UNK_ID) for w in words] + return [vocabulary.get(re.sub(_DIGIT_RE, "0", w), UNK_ID) for w in words] + + else: + if not normalize_digits: + return [vocabulary.get(w.encode('utf8'), UNK_ID) for w in words] + return [vocabulary.get(re.sub(_DIGIT_RE, "0", w.encode('utf8')), UNK_ID) for w in words] def data_to_token_ids_bak(data_path, target_path, vocabulary_path, From b6af9ce229e5a8f12bc9c432d70d69482c6e137e Mon Sep 17 00:00:00 2001 From: RuiZhang1993 Date: Sat, 13 May 2017 01:48:05 +0800 Subject: [PATCH 05/14] Fix bugs --- tf_chatbot/lib/predict.py | 2 +- tf_chatbot/lib/seq2seq_model.py | 34 +++++++++++++++++++++++---- tf_chatbot/lib/seq2seq_model_utils.py | 5 +++- tf_chatbot/lib/train.py | 2 +- 4 files changed, 35 
insertions(+), 8 deletions(-) diff --git a/tf_chatbot/lib/predict.py b/tf_chatbot/lib/predict.py index 6f61c99..bfe8a77 100644 --- a/tf_chatbot/lib/predict.py +++ b/tf_chatbot/lib/predict.py @@ -34,7 +34,7 @@ def _get_test_dataset(): for sentence in test_dataset: predicted_sentence = get_predicted_sentence( sentence, vocab, rev_vocab, model, sess, use_beam_search=True) - print(sentence, '->') + print(sentence.strip(), '->') print(predicted_sentence) results_fh.write(predicted_sentence + '\n') diff --git a/tf_chatbot/lib/seq2seq_model.py b/tf_chatbot/lib/seq2seq_model.py index dce3fbc..3a372a6 100644 --- a/tf_chatbot/lib/seq2seq_model.py +++ b/tf_chatbot/lib/seq2seq_model.py @@ -13,6 +13,20 @@ from tensorflow.contrib.rnn import GRUCell, BasicLSTMCell, MultiRNNCell, EmbeddingWrapper, static_rnn, OutputProjectionWrapper +def _extract_argmax_and_embed(embedding, + output_projection=None, + update_embedding=True): + def loop_function(prev, _): + if output_projection is not None: + prev = tf.nn.xw_plus_b(prev, output_projection[0], output_projection[1]) + prev_symbol = tf.argmax(prev, 1) + emb_prev = tf.nn.embedding_lookup(embedding, prev_symbol) + if not update_embedding: + emb_prev = tf.stop_gradient(emb_prev) + return emb_prev + + return loop_function + def _extract_sample_and_embed(embedding, output_projection=None, update_embedding=True): @@ -42,7 +56,7 @@ def embedding_attention_decoder(decoder_inputs, feed_previous=False, update_embedding_for_previous=True, dtype=None, - scope=None, + use_sample=True, initial_state_attention=False): if output_size is None: output_size = cell.output_size @@ -53,9 +67,17 @@ def embedding_attention_decoder(decoder_inputs, with tf.variable_scope("embedding_attention_decoder", dtype=dtype): embedding = tf.get_variable("embedding", [num_symbols, embedding_size]) - loop_function = _extract_sample_and_embed( - embedding, output_projection, - update_embedding_for_previous) if feed_previous else None + if feed_previous: + if use_sample: + loop_function = _extract_sample_and_embed( + embedding, output_projection, + update_embedding_for_previous) + else: + loop_function = _extract_argmax_and_embed( + embedding, output_projection, + update_embedding_for_previous) + else: + loop_function = None emb_inp = [ tf.nn.embedding_lookup( embedding, @@ -85,6 +107,7 @@ def __init__(self, learning_rate_decay_factor, use_lstm=False, num_samples=512, + use_sample=True, forward_only=False, beam_search_size=1, dtype=tf.float32): @@ -99,6 +122,7 @@ def __init__(self, self.learning_rate * learning_rate_decay_factor) self.global_step = tf.Variable(0, trainable=False) self.beam_search_size = beam_search_size + self.use_sample = use_sample # If we use sampled softmax, we need an output projection. 
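The use_sample switch introduced above only changes how the next symbol is chosen from the projected logits during decoding: argmax for greedy decoding, or a draw from the softmax distribution, which is roughly what the multinomial-based loop_function does. A toy sketch with made-up numbers:

import numpy as np

def next_symbol(logits, use_sample, rng=np.random.default_rng(0)):
    if not use_sample:
        return int(np.argmax(logits))             # greedy: most likely symbol
    probs = np.exp(logits - logits.max())
    probs /= probs.sum()                          # softmax over the vocabulary
    return int(rng.choice(len(logits), p=probs))  # sample one symbol

logits = np.array([0.1, 2.0, 0.5, 1.2])
print(next_symbol(logits, use_sample=False))      # always 1
print(next_symbol(logits, use_sample=True))       # usually 1, sometimes 3 or 2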
output_projection = None @@ -392,7 +416,7 @@ def step( new_beams.sort(key=lambda x: x[0], reverse=True) beams = [] for beam_ in new_beams: - if beam_[2] == data_utils.EOS_ID: + if beam_[2] == data_utils.EOS_ID and len(beam_[1]) > 2: result.append( (beam_[0], beam_[1][:-1], beam_[2], beam_[3])) else: diff --git a/tf_chatbot/lib/seq2seq_model_utils.py b/tf_chatbot/lib/seq2seq_model_utils.py index 10f9bb8..38766f8 100644 --- a/tf_chatbot/lib/seq2seq_model_utils.py +++ b/tf_chatbot/lib/seq2seq_model_utils.py @@ -13,7 +13,7 @@ _INDEX = ".index" -def create_model(session, forward_only): +def create_model(session, forward_only, use_sample=True): model = seq2seq_model.Seq2SeqModel( source_vocab_size=FLAGS.vocab_size, target_vocab_size=FLAGS.vocab_size, @@ -25,6 +25,7 @@ def create_model(session, forward_only): learning_rate=FLAGS.learning_rate, learning_rate_decay_factor=FLAGS.learning_rate_decay_factor, use_lstm=False, + use_sample=use_sample, beam_search_size=FLAGS.beam_search_size, forward_only=forward_only) @@ -62,6 +63,8 @@ def get_predicted_sentence( if use_beam_search: _, _, output_words = model.step( sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only=True, use_beam_search=True) + print("HERE!!!!!") + print(output_words) outputs = output_words[1:] output_sentence = ' '.join([rev_vocab[token_id] for token_id in outputs]) diff --git a/tf_chatbot/lib/train.py b/tf_chatbot/lib/train.py index df6b1db..0ecd2f2 100644 --- a/tf_chatbot/lib/train.py +++ b/tf_chatbot/lib/train.py @@ -21,7 +21,7 @@ def train(): print ( "Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size)) - model = create_model(sess, forward_only=False) + model = create_model(sess, forward_only=False, ) print ( "Reading development and training data (limit:%d)." 
% From 66e55cc25d6b18a60dfef51c84f586cce405e76d Mon Sep 17 00:00:00 2001 From: RuiZhang1993 Date: Mon, 15 May 2017 09:25:53 +0800 Subject: [PATCH 06/14] Save changes --- tf_chatbot/lib/predict.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tf_chatbot/lib/predict.py b/tf_chatbot/lib/predict.py index bfe8a77..0f9a51e 100644 --- a/tf_chatbot/lib/predict.py +++ b/tf_chatbot/lib/predict.py @@ -20,7 +20,7 @@ def _get_test_dataset(): with tf.Session() as sess, open(results_path, 'w') as results_fh: - model = create_model(sess, forward_only=True) + model = create_model(sess, forward_only=True, use_sample=False) model.batch_size = 1 vocab_path = os.path.join( @@ -33,7 +33,7 @@ def _get_test_dataset(): for sentence in test_dataset: predicted_sentence = get_predicted_sentence( - sentence, vocab, rev_vocab, model, sess, use_beam_search=True) + sentence, vocab, rev_vocab, model, sess, use_beam_search=False) print(sentence.strip(), '->') print(predicted_sentence) From 4da925b1f54a44b8d080e21966bab8338de705d2 Mon Sep 17 00:00:00 2001 From: RuiCheung Date: Mon, 15 May 2017 10:25:13 +0800 Subject: [PATCH 07/14] Commit --- tf_chatbot/lib/seq2seq_model_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tf_chatbot/lib/seq2seq_model_utils.py b/tf_chatbot/lib/seq2seq_model_utils.py index 10f9bb8..d87288c 100644 --- a/tf_chatbot/lib/seq2seq_model_utils.py +++ b/tf_chatbot/lib/seq2seq_model_utils.py @@ -13,7 +13,7 @@ _INDEX = ".index" -def create_model(session, forward_only): +def gicreate_model(session, forward_only): model = seq2seq_model.Seq2SeqModel( source_vocab_size=FLAGS.vocab_size, target_vocab_size=FLAGS.vocab_size, From 9609793d400c2076c9ae6c85ec8960a5b9480394 Mon Sep 17 00:00:00 2001 From: RuiCheung Date: Mon, 15 May 2017 10:36:05 +0800 Subject: [PATCH 08/14] Fix sample bug --- tf_chatbot/lib/seq2seq_model.py | 9 ++++----- tf_chatbot/lib/seq2seq_model_utils.py | 2 +- tf_chatbot/lib/train.py | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/tf_chatbot/lib/seq2seq_model.py b/tf_chatbot/lib/seq2seq_model.py index 3a372a6..f09f60e 100644 --- a/tf_chatbot/lib/seq2seq_model.py +++ b/tf_chatbot/lib/seq2seq_model.py @@ -56,7 +56,7 @@ def embedding_attention_decoder(decoder_inputs, feed_previous=False, update_embedding_for_previous=True, dtype=None, - use_sample=True, + use_sample=False, initial_state_attention=False): if output_size is None: output_size = cell.output_size @@ -79,9 +79,7 @@ def embedding_attention_decoder(decoder_inputs, else: loop_function = None emb_inp = [ - tf.nn.embedding_lookup( - embedding, - i) for i in decoder_inputs] + tf.nn.embedding_lookup(embedding,i) for i in decoder_inputs] return attention_decoder( emb_inp, @@ -107,7 +105,7 @@ def __init__(self, learning_rate_decay_factor, use_lstm=False, num_samples=512, - use_sample=True, + use_sample=False, forward_only=False, beam_search_size=1, dtype=tf.float32): @@ -226,6 +224,7 @@ def embedding_attention_sampled_seq2seq( output_size=output_size, output_projection=output_projection, feed_previous=feed_previous, + use_sample=self.use_sample, initial_state_attention=initial_state_attention) else: diff --git a/tf_chatbot/lib/seq2seq_model_utils.py b/tf_chatbot/lib/seq2seq_model_utils.py index 38766f8..8db0dd8 100644 --- a/tf_chatbot/lib/seq2seq_model_utils.py +++ b/tf_chatbot/lib/seq2seq_model_utils.py @@ -13,7 +13,7 @@ _INDEX = ".index" -def create_model(session, forward_only, use_sample=True): +def create_model(session, forward_only, use_sample=False): model = 
seq2seq_model.Seq2SeqModel( source_vocab_size=FLAGS.vocab_size, target_vocab_size=FLAGS.vocab_size, diff --git a/tf_chatbot/lib/train.py b/tf_chatbot/lib/train.py index 0ecd2f2..df6b1db 100644 --- a/tf_chatbot/lib/train.py +++ b/tf_chatbot/lib/train.py @@ -21,7 +21,7 @@ def train(): print ( "Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size)) - model = create_model(sess, forward_only=False, ) + model = create_model(sess, forward_only=False) print ( "Reading development and training data (limit:%d)." % From be659dd58c46d221a1e02d4d28d8bfb5d9466138 Mon Sep 17 00:00:00 2001 From: RuiCheung Date: Mon, 15 May 2017 13:54:18 +0800 Subject: [PATCH 09/14] Commit --- tf_chatbot/configs/config.py | 1 + tf_chatbot/lib/predict.py | 2 +- tf_chatbot/lib/train.py | 5 ++++- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tf_chatbot/configs/config.py b/tf_chatbot/configs/config.py index 4f136c7..12882e7 100644 --- a/tf_chatbot/configs/config.py +++ b/tf_chatbot/configs/config.py @@ -11,6 +11,7 @@ tf.app.flags.DEFINE_float('learning_rate_decay_factor', 0.99, 'Learning rate decays by this much.') tf.app.flags.DEFINE_float('max_gradient_norm', 5.0, 'Clip gradients to this norm') tf.app.flags.DEFINE_integer('batch_size', 128, 'Batch size to use during training') +tf.app.flags.DEFINE_integer('epoch_size', 20, 'Size of epoch') tf.app.flags.DEFINE_integer('vocab_size', 20000, 'Dialog vocabulary size') tf.app.flags.DEFINE_integer('size', 128, 'size of each model layer') diff --git a/tf_chatbot/lib/predict.py b/tf_chatbot/lib/predict.py index 0f9a51e..5a89ce6 100644 --- a/tf_chatbot/lib/predict.py +++ b/tf_chatbot/lib/predict.py @@ -20,7 +20,7 @@ def _get_test_dataset(): with tf.Session() as sess, open(results_path, 'w') as results_fh: - model = create_model(sess, forward_only=True, use_sample=False) + model = create_model(sess, forward_only=True, use_sample=True) model.batch_size = 1 vocab_path = os.path.join( diff --git a/tf_chatbot/lib/train.py b/tf_chatbot/lib/train.py index df6b1db..556497f 100644 --- a/tf_chatbot/lib/train.py +++ b/tf_chatbot/lib/train.py @@ -38,7 +38,10 @@ def train(): current_step = 0 previous_losses = [] - while True: + total_epoch = FLAGS.epoch_size + epoch_steps = np.sum([len(ts) for ts in train_set]) / FLAGS.batch_size + 1 + + while _ in range(epoch_steps * total_epoch): random_number_01 = np.random.random_sample() bucket_id = min([i for i in range(len(train_buckets_scale)) if train_buckets_scale[i] > random_number_01]) From bae3231021869325635ced79c645a4a9101938ea Mon Sep 17 00:00:00 2001 From: RuiZhang1993 Date: Mon, 15 May 2017 14:01:28 +0800 Subject: [PATCH 10/14] fix bug --- tf_chatbot/lib/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tf_chatbot/lib/train.py b/tf_chatbot/lib/train.py index 556497f..9c46f35 100644 --- a/tf_chatbot/lib/train.py +++ b/tf_chatbot/lib/train.py @@ -41,7 +41,7 @@ def train(): total_epoch = FLAGS.epoch_size epoch_steps = np.sum([len(ts) for ts in train_set]) / FLAGS.batch_size + 1 - while _ in range(epoch_steps * total_epoch): + while model.global_step.eval() < (epoch_steps * total_epoch): random_number_01 = np.random.random_sample() bucket_id = min([i for i in range(len(train_buckets_scale)) if train_buckets_scale[i] > random_number_01]) From 676a31bc08da29405bfd1636ac6530024cd5f7cb Mon Sep 17 00:00:00 2001 From: RuiZhang1993 Date: Mon, 15 May 2017 20:01:13 +0800 Subject: [PATCH 11/14] debug beam search --- tf_chatbot/lib/predict.py | 7 ++-- tf_chatbot/lib/seq2seq_model.py | 50 
+++++++++++++++++---------- tf_chatbot/lib/seq2seq_model_utils.py | 18 ++++++++-- 3 files changed, 51 insertions(+), 24 deletions(-) diff --git a/tf_chatbot/lib/predict.py b/tf_chatbot/lib/predict.py index 5a89ce6..c539af1 100644 --- a/tf_chatbot/lib/predict.py +++ b/tf_chatbot/lib/predict.py @@ -20,7 +20,7 @@ def _get_test_dataset(): with tf.Session() as sess, open(results_path, 'w') as results_fh: - model = create_model(sess, forward_only=True, use_sample=True) + model = create_model(sess, forward_only=True, use_sample=False) model.batch_size = 1 vocab_path = os.path.join( @@ -33,8 +33,11 @@ def _get_test_dataset(): for sentence in test_dataset: predicted_sentence = get_predicted_sentence( - sentence, vocab, rev_vocab, model, sess, use_beam_search=False) + sentence, vocab, rev_vocab, model, sess, use_beam_search=True) print(sentence.strip(), '->') print(predicted_sentence) + # ----------For Debug ---------- + #break + # ----------End Debug ---------- results_fh.write(predicted_sentence + '\n') diff --git a/tf_chatbot/lib/seq2seq_model.py b/tf_chatbot/lib/seq2seq_model.py index f09f60e..0b90b91 100644 --- a/tf_chatbot/lib/seq2seq_model.py +++ b/tf_chatbot/lib/seq2seq_model.py @@ -35,7 +35,7 @@ def loop_function(prev, _): if output_projection is not None: prev = tf.nn.xw_plus_b( prev, output_projection[0], output_projection[1]) - prev_symbol = tf.squeeze(tf.multinomial(prev, 1), axis=1) + prev_symbol = tf.squeeze(tf.multinomial(tf.nn.softmax(prev), 1), axis=1) emb_prev = tf.nn.embedding_lookup(embedding, prev_symbol) if not update_embedding: emb_prev = tf.stop_gradient(emb_prev) @@ -373,53 +373,65 @@ def step( # attention_state, encoder_state outputs = session.run(output_feed, input_feed) # score, result, last_token, encoder_state + + temp_encoder_states = outputs[1][0] + beams = [(0.0, [data_utils.GO_ID], data_utils.GO_ID, - outputs[1])] * 3 + outputs[1][0])] * self.beam_search_size result = [] step = 0 attention_state = outputs[0] - while step < decoder_size and len( - result) < self.beam_search_size: + while step < decoder_size and len(result) < self.beam_search_size: step += 1 - _last_tokens = [beam_[2] for beam_ in beams] _encoder_state = [beam_[3] for beam_ in beams] - output_feed = [ - self.topk_ids[bucket_id], - self.topk_probs[bucket_id], - self.decoder_out_state[bucket_id]] + output_feed = self.outputs[bucket_id] + #self.decoder_out_state[bucket_id]] input_feed = {} input_feed[self.model_attention_states[bucket_id].name] = attention_state - input_feed[self.model_encoder_states[bucket_id].name] = np.squeeze( - np.array(_encoder_state)) + input_feed[self.model_encoder_states[bucket_id].name] = np.squeeze(np.array(_encoder_state)) + for l in range(step): - _decoder_inputs = [beam_[1][l] for beam_ in beams] + _decoder_inputs = np.array([beam_[1][l] for beam_ in beams]) input_feed[self.decoder_inputs[l].name] = _decoder_inputs - _tok_ids, _tok_probs, _out_states = session.run( - output_feed, input_feed) + _outputs = session.run(output_feed, input_feed) + + _tok_probs, _tok_ids = [], [] + for _idx in range(self.beam_search_size): + _tok_prob, _tok_id = tf.nn.top_k(tf.nn.softmax(_outputs[step-1][_idx]), self.beam_search_size) + _tok_probs.append(_tok_prob.eval()) + _tok_ids.append(_tok_id.eval()) new_beams = [] for beam_idx in range(self.beam_search_size): for _idx in range(self.beam_search_size): + #print("before - ", "score:", beams[beam_idx][0], "strs:", beams[beam_idx][1], "next:", beams[beam_idx][2]) new_beams.append( (beams[beam_idx][0] + _tok_probs[beam_idx][_idx], - 
beams[beam_idx][1] + [ - _tok_ids[beam_idx][_idx]], - _tok_ids[beam_idx][_idx], - _out_states[beam_idx])) + beams[beam_idx][1] + [_tok_ids[beam_idx][_idx]], + _tok_ids[beam_idx][_idx], + #_out_states[beam_idx])) + temp_encoder_states)) + #print("after - ", "score:", new_beams[-1][0], "strs:", new_beams[-1][1], "next:",new_beams[-1][2]) + #print("=========") new_beams.sort(key=lambda x: x[0], reverse=True) + + unduplicate_set = set() beams = [] for beam_ in new_beams: + #if False: if beam_[2] == data_utils.EOS_ID and len(beam_[1]) > 2: result.append( (beam_[0], beam_[1][:-1], beam_[2], beam_[3])) else: - beams.append(beam_) + if str(beam_[1]) not in unduplicate_set: + unduplicate_set.add(str(beam_[1])) + beams.append(beam_) if len(beams) == self.beam_search_size: break diff --git a/tf_chatbot/lib/seq2seq_model_utils.py b/tf_chatbot/lib/seq2seq_model_utils.py index 8db0dd8..0544ffd 100644 --- a/tf_chatbot/lib/seq2seq_model_utils.py +++ b/tf_chatbot/lib/seq2seq_model_utils.py @@ -61,10 +61,22 @@ def get_predicted_sentence( feed_data, bucket_id) if use_beam_search: + new_encoder_inputs, new_decoder_inputs, new_target_weights = [],[],[] + for _array in decoder_inputs: + for _item in _array: + _de_input = np.array([_item] * FLAGS.beam_search_size, dtype=np.int32) + new_decoder_inputs.append(_de_input) + for _array in encoder_inputs: + for _item in _array: + _en_input = np.array([_item] * FLAGS.beam_search_size, dtype=np.int32) + new_encoder_inputs.append(_en_input) + for _array in target_weights: + for _item in _array: + _ta_input = np.array([_item] * FLAGS.beam_search_size, dtype=np.int32) + new_target_weights.append(_ta_input) + _, _, output_words = model.step( - sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only=True, use_beam_search=True) - print("HERE!!!!!") - print(output_words) + sess, new_encoder_inputs, new_decoder_inputs, new_target_weights, bucket_id, forward_only=True, use_beam_search=True) outputs = output_words[1:] output_sentence = ' '.join([rev_vocab[token_id] for token_id in outputs]) From 61576e2191b085ab0f6e571d6b3061b69d2744f4 Mon Sep 17 00:00:00 2001 From: RuiCheung Date: Tue, 16 May 2017 09:28:25 +0800 Subject: [PATCH 12/14] update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index eb7fcbb..a28edb5 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,2 @@ # SMIPG-NLPCC2017 -Emotional Conversation Generation Task in NLPCC2017 +GRU + Attention + Beam Search (+ Sample)的Seq2Seq模型 From f1820520dfbed4cd21fd202d438dfbf842f9560e Mon Sep 17 00:00:00 2001 From: RuiZhang1993 Date: Tue, 16 May 2017 16:55:10 +0800 Subject: [PATCH 13/14] update --- tf_chatbot/lib/seq2seq_model.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tf_chatbot/lib/seq2seq_model.py b/tf_chatbot/lib/seq2seq_model.py index 0b90b91..61160e2 100644 --- a/tf_chatbot/lib/seq2seq_model.py +++ b/tf_chatbot/lib/seq2seq_model.py @@ -384,6 +384,9 @@ def step( step = 0 attention_state = outputs[0] + def numpy_softmax(x): + return np.exp(x) / np.sum(np.exp(x), axis=0) + while step < decoder_size and len(result) < self.beam_search_size: step += 1 _encoder_state = [beam_[3] for beam_ in beams] @@ -400,10 +403,16 @@ def step( _outputs = session.run(output_feed, input_feed) _tok_probs, _tok_ids = [], [] - for _idx in range(self.beam_search_size): - _tok_prob, _tok_id = tf.nn.top_k(tf.nn.softmax(_outputs[step-1][_idx]), self.beam_search_size) - _tok_probs.append(_tok_prob.eval()) - 
_tok_ids.append(_tok_id.eval()) + + if step == 1: + for _idx in range(self.beam_search_size): + _tok_ids.append(np.random.choice(range(self.target_vocab_size), size=self.beam_search_size, replace=False, p=numpy_softmax(_outputs[step-1][_idx]))) + _tok_probs.append(_outputs[step-1][_idx][_tok_ids[_idx]]) + else: + for _idx in range(self.beam_search_size): + _tok_prob, _tok_id = tf.nn.top_k(tf.nn.softmax(_outputs[step-1][_idx]), self.beam_search_size) + _tok_probs.append(_tok_prob.eval()) + _tok_ids.append(_tok_id.eval()) new_beams = [] @@ -425,7 +434,7 @@ def step( beams = [] for beam_ in new_beams: #if False: - if beam_[2] == data_utils.EOS_ID and len(beam_[1]) > 2: + if beam_[2] == data_utils.EOS_ID: result.append( (beam_[0], beam_[1][:-1], beam_[2], beam_[3])) else: From 527c2c1a433b9775525f92f2662272d50bf5014e Mon Sep 17 00:00:00 2001 From: RuiCheung Date: Wed, 17 May 2017 13:47:44 +0800 Subject: [PATCH 14/14] Fix bugs --- tf_chatbot/configs/config.py | 3 +++ tf_chatbot/lib/predict.py | 7 ++----- tf_chatbot/lib/seq2seq_model.py | 28 +++++++++++++++++++++------- 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/tf_chatbot/configs/config.py b/tf_chatbot/configs/config.py index 12882e7..79fe901 100644 --- a/tf_chatbot/configs/config.py +++ b/tf_chatbot/configs/config.py @@ -21,6 +21,9 @@ tf.app.flags.DEFINE_integer('max_train_data_size', 0, 'Limit on the size of training data (0: no limit)') tf.app.flags.DEFINE_integer('steps_per_checkpoint', 100, 'How many training steps to do per checkpoint') +tf.app.flags.DEFINE_boolean('use_sample', True, 'use sample while generating') +tf.app.flags.DEFINE_boolean('use_beam_search', True, 'use beam search while generating') + FLAGS = tf.app.flags.FLAGS BUCKETS = [(5,10), (10, 15), (20, 25), (40, 50)] \ No newline at end of file diff --git a/tf_chatbot/lib/predict.py b/tf_chatbot/lib/predict.py index c539af1..e1266f5 100644 --- a/tf_chatbot/lib/predict.py +++ b/tf_chatbot/lib/predict.py @@ -20,7 +20,7 @@ def _get_test_dataset(): with tf.Session() as sess, open(results_path, 'w') as results_fh: - model = create_model(sess, forward_only=True, use_sample=False) + model = create_model(sess, forward_only=True, use_sample=FLAGS.use_sample) model.batch_size = 1 vocab_path = os.path.join( @@ -33,11 +33,8 @@ def _get_test_dataset(): for sentence in test_dataset: predicted_sentence = get_predicted_sentence( - sentence, vocab, rev_vocab, model, sess, use_beam_search=True) + sentence, vocab, rev_vocab, model, sess, use_beam_search=FLAGS.use_beam_search) print(sentence.strip(), '->') print(predicted_sentence) - # ----------For Debug ---------- - #break - # ----------End Debug ---------- results_fh.write(predicted_sentence + '\n') diff --git a/tf_chatbot/lib/seq2seq_model.py b/tf_chatbot/lib/seq2seq_model.py index 61160e2..fc69eb2 100644 --- a/tf_chatbot/lib/seq2seq_model.py +++ b/tf_chatbot/lib/seq2seq_model.py @@ -10,7 +10,7 @@ import tf_chatbot.lib.data_utils as data_utils from tensorflow.contrib.legacy_seq2seq import sequence_loss, attention_decoder -from tensorflow.contrib.rnn import GRUCell, BasicLSTMCell, MultiRNNCell, EmbeddingWrapper, static_rnn, OutputProjectionWrapper +from tensorflow.contrib.rnn import GRUCell, BasicLSTMCell, MultiRNNCell, EmbeddingWrapper, static_rnn, OutputProjectionWrapper, static_bidirectional_rnn def _extract_argmax_and_embed(embedding, @@ -104,6 +104,7 @@ def __init__(self, learning_rate, learning_rate_decay_factor, use_lstm=False, + use_bidirection=True, num_samples=512, use_sample=False, forward_only=False, @@ -128,9 
+129,10 @@ def __init__(self, # Sampled softmax only makes sense if we sample less than vocabulary # size. if num_samples > 0 and num_samples < self.target_vocab_size: - w_t = tf.get_variable( - "proj_w", [ - self.target_vocab_size, size], dtype=dtype) + if use_bidirection: + w_t = tf.get_variable("proj_w", [self.target_vocab_size, size * 2], dtype=dtype) + else: + w_t = tf.get_variable("proj_w", [self.target_vocab_size, size], dtype=dtype) w = tf.transpose(w_t) b = tf.get_variable( "proj_b", [ @@ -187,6 +189,7 @@ def embedding_attention_sampled_seq2seq( output_projection=None, feed_previous=False, initial_state_attention=False, + use_bidirection=False, dtype=tf.float32): with tf.variable_scope("embedding_attention_sampled_seq2seq"): encoder_cell = EmbeddingWrapper( @@ -194,8 +197,18 @@ def embedding_attention_sampled_seq2seq( embedding_classes=num_encoder_symbols, embedding_size=embedding_size ) - encoder_outputs, encoder_state = static_rnn( - encoder_cell, encoder_inputs, dtype=dtype) + if not use_bidirection: + encoder_outputs, encoder_state = static_rnn( + encoder_cell, encoder_inputs, dtype=dtype) + else: + encoder_outputs, encoder_state_fw, encoder_state_bw = static_bidirectional_rnn( + cell_fw=encoder_cell, + cell_bw=encoder_cell, + inputs=encoder_inputs, + dtype=dtype) + encoder_state = tf.concat([encoder_state_fw, encoder_state_bw], axis=1) + + cell = GRUCell(cell.state_size * 2) top_states = [tf.reshape( e, [-1, 1, cell.output_size]) for e in encoder_outputs] @@ -239,7 +252,8 @@ def embedding_attention_sampled_seq2seq( embedding_size=size, bucket_index=bucket_id, output_projection=output_projection, - feed_previous=do_decode) + feed_previous=do_decode, + use_bidirection=use_bidirection) # Feeds for inputs. self.encoder_inputs = []
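
Note on patch 14: the optional bidirectional GRU encoder concatenates the forward and backward final states, so every tensor that used to be `size` wide becomes `2 * size` wide downstream; that is why proj_w grows to [target_vocab_size, size * 2] and the decoder cell is rebuilt with a doubled state size. The following is a small numpy illustration of the shape bookkeeping only, with made-up toy values and no TensorFlow, not the repository's code:

    import numpy as np

    size, vocab_size = 128, 1000
    state_fw = np.random.randn(1, size)           # forward encoder final state
    state_bw = np.random.randn(1, size)           # backward encoder final state
    encoder_state = np.concatenate([state_fw, state_bw], axis=1)   # shape [1, 2*size]

    w = np.random.randn(2 * size, vocab_size)     # proj_w is [vocab_size, 2*size], used transposed
    b = np.zeros(vocab_size)
    logits = encoder_state @ w + b                # output projection must accept 2*size inputs
    print(encoder_state.shape, logits.shape)      # (1, 256) (1, 1000)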
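
Note on patches 05-13: taken together they implement beam search as a Python-side loop inside Seq2SeqModel.step(): each surviving beam is expanded with its top-k next tokens, candidates are re-ranked by cumulative score, prefixes that emit EOS are moved to the result list, and duplicate prefixes are dropped. Below is a minimal self-contained sketch of that loop; GO_ID, EOS_ID, score_fn and the toy scorer are stand-ins for illustration, not the repository's API:

    import numpy as np

    GO_ID, EOS_ID = 1, 2           # assumed special token ids
    BEAM_SIZE, MAX_LEN = 3, 10     # mirrors FLAGS.beam_search_size / decoder_size

    def softmax(logits):
        # numerically stable softmax (the patch's numpy_softmax omits the max shift)
        z = logits - np.max(logits)
        e = np.exp(z)
        return e / e.sum()

    def beam_search(score_fn, vocab_size=20):
        # score_fn(prefix) -> logits over the vocabulary for the next token;
        # it stands in for one decoder step of the real model.
        beams = [(0.0, [GO_ID], GO_ID)]            # (cumulative score, tokens, last token)
        finished = []
        for _ in range(MAX_LEN):
            if len(finished) >= BEAM_SIZE:
                break
            candidates = []
            for score, tokens, _last in beams:
                probs = softmax(score_fn(tokens))
                top_ids = np.argsort(probs)[-BEAM_SIZE:][::-1]          # top-k next tokens
                for tok in top_ids:
                    candidates.append((score + probs[tok], tokens + [int(tok)], int(tok)))
            candidates.sort(key=lambda c: c[0], reverse=True)
            beams, seen = [], set()
            for cand in candidates:
                if cand[2] == EOS_ID:
                    finished.append((cand[0], cand[1][:-1]))            # drop the EOS token
                elif str(cand[1]) not in seen:                          # de-duplicate prefixes
                    seen.add(str(cand[1]))
                    beams.append(cand)
                if len(beams) == BEAM_SIZE:
                    break
        return finished or [(s, t) for s, t, _ in beams]

    # toy usage: a fake scorer that prefers high token ids, then strongly prefers EOS
    print(beam_search(lambda toks: np.arange(20) * 0.1
                      + (5.0 if len(toks) > 2 else 0.0) * np.eye(20)[EOS_ID]))

Like the patched step(), this sketch accumulates raw per-step probabilities; summing log-probabilities is the more common scoring choice and avoids favoring longer prefixes for numerical reasons.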
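
Note on patch 11's feed preparation: when use_beam_search=True, get_predicted_sentence() first replicates every time-major feed beam_search_size times so the batch dimension carries one row per beam. A rough numpy sketch of that replication (tile_for_beam is a hypothetical helper, not a function in the repo):

    import numpy as np

    def tile_for_beam(time_major_inputs, beam_size):
        # each element is the batch (size 1) for one time step; repeat it once per beam
        return [np.repeat(np.asarray(step_vals, dtype=np.int32), beam_size)
                for step_vals in time_major_inputs]

    encoder_inputs = [np.array([7]), np.array([42]), np.array([0])]   # toy token ids, batch_size = 1
    print(tile_for_beam(encoder_inputs, beam_size=3))
    # [array([7, 7, 7], dtype=int32), array([42, 42, 42], dtype=int32), array([0, 0, 0], dtype=int32)]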
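
Note on patches 09-10: the infinite training loop is replaced by a bound derived from an epoch count; the number of updates per epoch is estimated from the bucketed training set and the batch size, and training stops once the model's global step reaches epoch_size epochs. A toy sketch of the arithmetic (values are made up, and integer division is used here for clarity):

    train_set = [[("q", "a")] * 500, [("q", "a")] * 300]   # toy buckets
    batch_size, total_epoch = 128, 20
    epoch_steps = sum(len(bucket) for bucket in train_set) // batch_size + 1
    global_step = 0
    while global_step < epoch_steps * total_epoch:
        global_step += 1            # stands in for one model.step(...) update
    print(global_step)              # 7 * 20 = 140 updates for this toy data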