This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit 228feae

Merge pull request #100 from kolloldas/lstm_attention:
Adding attention to LSTM seq2seq baseline

2 parents b287c0e + 7b20843, commit 228feae

File tree: 3 files changed, +220 -1 lines changed

tensor2tensor/models/common_layers.py

Lines changed: 0 additions & 1 deletion
@@ -331,7 +331,6 @@ def conv2d_kernel(kernel_size_arg, name_suffix):
 
   return conv2d_kernel(kernel_size, "single")
 
-
 def conv(inputs, filters, kernel_size, **kwargs):
   return conv_internal(tf.layers.conv2d, inputs, filters, kernel_size, **kwargs)
 
tensor2tensor/models/lstm.py

Lines changed: 197 additions & 0 deletions
@@ -21,11 +21,152 @@
 # Dependency imports
 
 from tensor2tensor.models import common_layers
+from tensor2tensor.models import common_hparams
 from tensor2tensor.utils import registry
 from tensor2tensor.utils import t2t_model
 
 import tensorflow as tf
+from tensorflow.python.ops import rnn_cell_impl
+from tensorflow.python.util import nest
 
+import collections
+
+# Named tuple tracking the wrapped cell state and the attention values.
+AttentionTuple = collections.namedtuple("AttentionTuple",
+                                        ("state", "attention"))
+
+
+class ExternalAttentionCellWrapper(rnn_cell_impl.RNNCell):
+  """Wrapper for external attention states, for use in an encoder-decoder setup."""
+
+  def __init__(self, cell, attn_states, attn_vec_size=None,
+               input_size=None, state_is_tuple=True, reuse=None):
+    """Create a cell with attention.
+
+    Args:
+      cell: an RNNCell to which attention is added.
+      attn_states: external attention states, typically the encoder output,
+        of shape [batch_size, time_steps, hidden_size].
+      attn_vec_size: integer, the number of convolutional features calculated
+        on the attention state, and the size of the hidden layer built from
+        the base cell state. Defaults to attn_size.
+      input_size: integer, the size of a hidden linear layer built from the
+        inputs and attention. Derived from the input tensor by default.
+      state_is_tuple: if True, accepted and returned states are n-tuples,
+        where `n = len(cells)`. Must be True; otherwise a ValueError is
+        raised.
+      reuse: (optional) Python boolean describing whether to reuse variables
+        in an existing scope. If not `True`, and the existing scope already
+        has the given variables, an error is raised.
+
+    Raises:
+      TypeError: if cell is not an RNNCell.
+      ValueError: if `state_is_tuple` is False, if attn_states is not rank 3,
+        or if its innermost dimension (the hidden size) is None.
+    """
+    super(ExternalAttentionCellWrapper, self).__init__(_reuse=reuse)
+    if not rnn_cell_impl._like_rnncell(cell):  # pylint: disable=protected-access
+      raise TypeError("The parameter cell is not RNNCell.")
+
+    if not state_is_tuple:
+      raise ValueError("Only tuple state is supported")
+
+    self._cell = cell
+    self._input_size = input_size
+
+    # Validate attn_states shape.
+    attn_shape = attn_states.get_shape()
+    if not attn_shape or len(attn_shape) != 3:
+      raise ValueError("attn_states must be rank 3")
+
+    self._attn_states = attn_states
+    self._attn_size = attn_shape[2].value
+    if self._attn_size is None:
+      raise ValueError("Hidden size of attn_states cannot be None")
+
+    self._attn_vec_size = attn_vec_size
+    if self._attn_vec_size is None:
+      self._attn_vec_size = self._attn_size
+
+    self._reuse = reuse
+
+  @property
+  def state_size(self):
+    return AttentionTuple(self._cell.state_size, self._attn_size)
+
+  @property
+  def output_size(self):
+    return self._attn_size
+
+  def combine_state(self, previous_state):
+    """Combines a previous state (usually from an encoder) with the internal
+    attention values.
+
+    Use this function to derive the initial state passed into this cell,
+    since the cell expects a named tuple (AttentionTuple).
+
+    Args:
+      previous_state: state from another block that will be fed into this
+        cell. Must have the same structure as the state of the cell wrapped
+        by this one.
+
+    Returns:
+      The combined state (AttentionTuple).
+    """
+    batch_size = self._attn_states.get_shape()[0].value
+    if batch_size is None:
+      batch_size = tf.shape(self._attn_states)[0]
+    zeroed_state = self.zero_state(batch_size, self._attn_states.dtype)
+    return AttentionTuple(previous_state, zeroed_state.attention)
+
+  def call(self, inputs, state):
+    """Long short-term memory cell with attention (LSTMA)."""
+    if not isinstance(state, AttentionTuple):
+      raise TypeError("State must be of type AttentionTuple")
+
+    state, attns = state
+    attn_states = self._attn_states
+    attn_length = attn_states.get_shape()[1].value
+    if attn_length is None:
+      attn_length = tf.shape(attn_states)[1]
+
+    input_size = self._input_size
+    if input_size is None:
+      input_size = inputs.get_shape().as_list()[1]
+    if attns is not None:
+      inputs = rnn_cell_impl._linear([inputs, attns], input_size, True)
+    lstm_output, new_state = self._cell(inputs, state)
+
+    new_state_cat = tf.concat(nest.flatten(new_state), 1)
+    new_attns = self._attention(new_state_cat, attn_states, attn_length)
+
+    with tf.variable_scope("attn_output_projection"):
+      output = rnn_cell_impl._linear([lstm_output, new_attns],
+                                     self._attn_size, True)
+
+    new_state = AttentionTuple(new_state, new_attns)
+
+    return output, new_state
+
+  def _attention(self, query, attn_states, attn_length):
+    conv2d = tf.nn.conv2d
+    reduce_sum = tf.reduce_sum
+    softmax = tf.nn.softmax
+    tanh = tf.tanh
+
+    with tf.variable_scope("attention"):
+      k = tf.get_variable(
+          "attn_w", [1, 1, self._attn_size, self._attn_vec_size])
+      v = tf.get_variable("attn_v", [self._attn_vec_size, 1])
+      hidden = tf.reshape(attn_states,
+                          [-1, attn_length, 1, self._attn_size])
+      hidden_features = conv2d(hidden, k, [1, 1, 1, 1], "SAME")
+      y = rnn_cell_impl._linear(query, self._attn_vec_size, True)
+      y = tf.reshape(y, [-1, 1, 1, self._attn_vec_size])
+      s = reduce_sum(v * tanh(hidden_features + y), [2, 3])
+      a = softmax(s)
+      d = reduce_sum(
+          tf.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
+      new_attns = tf.reshape(d, [-1, self._attn_size])
+
+      return new_attns
 
 def lstm(inputs, hparams, train, name, initial_state=None):
   """Run LSTM cell on inputs, assuming they are [batch x time x size]."""
@@ -44,6 +185,25 @@ def dropout_lstm_cell():
         dtype=tf.float32,
         time_major=False)
 
+def lstm_attention_decoder(inputs, hparams, train, name, initial_state,
+                           attn_states):
+  """Run LSTM cell with attention on inputs, assuming they are [batch x time x size]."""
+
+  def dropout_lstm_cell():
+    return tf.contrib.rnn.DropoutWrapper(
+        tf.nn.rnn_cell.BasicLSTMCell(hparams.hidden_size),
+        input_keep_prob=1.0 - hparams.dropout * tf.to_float(train))
+
+  layers = [dropout_lstm_cell() for _ in range(hparams.num_hidden_layers)]
+  cell = ExternalAttentionCellWrapper(
+      tf.nn.rnn_cell.MultiRNNCell(layers), attn_states,
+      attn_vec_size=hparams.attn_vec_size)
+  initial_state = cell.combine_state(initial_state)
+  with tf.variable_scope(name):
+    return tf.nn.dynamic_rnn(
+        cell,
+        inputs,
+        initial_state=initial_state,
+        dtype=tf.float32,
+        time_major=False)
 
 def lstm_seq2seq_internal(inputs, targets, hparams, train):
   """The basic LSTM seq2seq model, main step used for training."""
@@ -63,6 +223,23 @@ def lstm_seq2seq_internal(inputs, targets, hparams, train):
         initial_state=final_encoder_state)
     return tf.expand_dims(decoder_outputs, axis=2)
 
+def lstm_seq2seq_internal_attention(inputs, targets, hparams, train):
+  """LSTM seq2seq model with attention, main step used for training."""
+  with tf.variable_scope("lstm_seq2seq_attention"):
+    # Flatten inputs.
+    inputs = common_layers.flatten4d3d(inputs)
+    # LSTM encoder.
+    encoder_outputs, final_encoder_state = lstm(
+        tf.reverse(inputs, axis=[1]), hparams, train, "encoder")
+    # LSTM decoder with attention.
+    shifted_targets = common_layers.shift_left(targets)
+    decoder_outputs, _ = lstm_attention_decoder(
+        common_layers.flatten4d3d(shifted_targets),
+        hparams,
+        train,
+        "decoder",
+        final_encoder_state, encoder_outputs)
+    return tf.expand_dims(decoder_outputs, axis=2)
 
 @registry.register_model("baseline_lstm_seq2seq")
 class LSTMSeq2Seq(t2t_model.T2TModel):
@@ -71,3 +248,23 @@ def model_fn_body(self, features):
     train = self._hparams.mode == tf.contrib.learn.ModeKeys.TRAIN
     return lstm_seq2seq_internal(features["inputs"], features["targets"],
                                  self._hparams, train)
+
+
+@registry.register_model("baseline_lstm_seq2seq_attention")
+class LSTMSeq2SeqAttention(t2t_model.T2TModel):
+
+  def model_fn_body(self, features):
+    train = self._hparams.mode == tf.contrib.learn.ModeKeys.TRAIN
+    return lstm_seq2seq_internal_attention(features["inputs"],
+                                           features["targets"],
+                                           self._hparams, train)
+
+
+@registry.register_hparams
+def lstm_attention():
+  """Hparams for LSTM with attention."""
+  hparams = common_hparams.basic_params1()
+  hparams.batch_size = 128
+  hparams.hidden_size = 128
+  hparams.num_hidden_layers = 2
+
+  # Attention.
+  hparams.add_hparam("attn_vec_size", hparams.hidden_size)
+  return hparams
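Editor's note: the following is a minimal sketch, not part of the commit, showing how the new functions fit together outside of the T2TModel subclass. It mirrors lstm_seq2seq_internal_attention above; the batch/length values and tensor names are made up for illustration, and TF 1.x graph mode plus the basic_params1 dropout hparam are assumed.

# Editorial sketch only -- not part of the commit. Toy shapes for illustration.
import tensorflow as tf
from tensor2tensor.models import lstm

hparams = lstm.lstm_attention()   # hidden_size = 128, attn_vec_size = 128
train = True

# Stand-ins for embedded, flattened [batch x time x size] inputs and targets.
encoder_inputs = tf.zeros([4, 7, hparams.hidden_size])
decoder_inputs = tf.zeros([4, 9, hparams.hidden_size])

# Encoder: the same reversed-input LSTM used by lstm_seq2seq_internal_attention.
encoder_outputs, final_encoder_state = lstm.lstm(
    tf.reverse(encoder_inputs, axis=[1]), hparams, train, "encoder")

# Decoder: ExternalAttentionCellWrapper attends over encoder_outputs;
# combine_state() is called inside to wrap final_encoder_state into an
# AttentionTuple before tf.nn.dynamic_rnn runs.
decoder_outputs, _ = lstm.lstm_attention_decoder(
    decoder_inputs, hparams, train, "decoder",
    final_encoder_state, encoder_outputs)   # shape [4, 9, hidden_size]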

tensor2tensor/models/lstm_test.py

Lines changed: 23 additions & 0 deletions
@@ -51,6 +51,29 @@ def testLSTMSeq2Seq(self):
       res = session.run(logits)
       self.assertEqual(res.shape, (3, 6, 1, 1, vocab_size))
 
+  def testLSTMSeq2Seq_attention(self):
+    vocab_size = 9
+    x = np.random.random_integers(1, high=vocab_size - 1, size=(3, 5, 1, 1))
+    y = np.random.random_integers(1, high=vocab_size - 1, size=(3, 6, 1, 1))
+    hparams = lstm.lstm_attention()
+
+    p_hparams = problem_hparams.test_problem_hparams(hparams, vocab_size,
+                                                     vocab_size)
+    x = tf.constant(x, dtype=tf.int32)
+    x._shape = tf.TensorShape([None, None, 1, 1])
+
+    with self.test_session() as session:
+      features = {
+          "inputs": x,
+          "targets": tf.constant(y, dtype=tf.int32),
+      }
+      model = lstm.LSTMSeq2SeqAttention(
+          hparams, tf.contrib.learn.ModeKeys.TRAIN, p_hparams)
+      sharded_logits, _, _ = model.model_fn(features)
+      logits = tf.concat(sharded_logits, 0)
+      session.run(tf.global_variables_initializer())
+      res = session.run(logits)
+      self.assertEqual(res.shape, (3, 6, 1, 1, vocab_size))
 
 if __name__ == "__main__":
   tf.test.main()
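Editor's note: since lstm_test.py ends with tf.test.main(), the new testLSTMSeq2Seq_attention case can be run directly, for example with python tensor2tensor/models/lstm_test.py from the repository root, assuming TensorFlow and the tensor2tensor package are importable.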
