From 446279fb64fb099fe50b97b21eda913a2c3e4044 Mon Sep 17 00:00:00 2001
From: RuiCheung
Date: Fri, 12 May 2017 14:46:17 +0800
Subject: [PATCH 01/14] Add beam search
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tf_chatbot/configs/config.py             |   3 +-
 tf_chatbot/lib/basic/advanced_seq2seq.py | 699 ++++++++++++-----------
 tf_chatbot/lib/data_utils.py             |   8 +-
 tf_chatbot/lib/predict.py                |   7 +-
 tf_chatbot/lib/seq2seq_model.py          | 245 +++++---
 tf_chatbot/lib/seq2seq_model_utils.py    |  28 +-
 6 files changed, 558 insertions(+), 432 deletions(-)

diff --git a/tf_chatbot/configs/config.py b/tf_chatbot/configs/config.py
index 6e7f4a7..918ef9a 100644
--- a/tf_chatbot/configs/config.py
+++ b/tf_chatbot/configs/config.py
@@ -12,9 +12,10 @@
 tf.app.flags.DEFINE_float('max_gradient_norm', 5.0, 'Clip gradients to this norm')
 tf.app.flags.DEFINE_integer('batch_size', 128, 'Batch size to use during training')
-tf.app.flags.DEFINE_integer('vocab_size', 20000, 'Dialog vocabulary size')
+tf.app.flags.DEFINE_integer('vocab_size', 1000, 'Dialog vocabulary size')
 tf.app.flags.DEFINE_integer('size', 128, 'size of each model layer')
 tf.app.flags.DEFINE_integer('num_layers', 1, 'Numbers of layers in the model')
+tf.app.flags.DEFINE_integer('beam_search_size', 3, 'Size of beam search op')
 tf.app.flags.DEFINE_integer('max_train_data_size', 0, 'Limit on the size of training data (0: no limit)')
 tf.app.flags.DEFINE_integer('steps_per_checkpoint', 100, 'How many training steps to do per checkpoint')
diff --git a/tf_chatbot/lib/basic/advanced_seq2seq.py b/tf_chatbot/lib/basic/advanced_seq2seq.py
index 6925393..9812072 100644
--- a/tf_chatbot/lib/basic/advanced_seq2seq.py
+++ b/tf_chatbot/lib/basic/advanced_seq2seq.py
@@ -3,8 +3,8 @@
 from __future__ import print_function
 
 # We disable pylint because we need python3 compatibility.
-from six.moves import xrange # pylint: disable=redefined-builtin
-from six.moves import zip # pylint: disable=redefined-builtin
+# from six.moves import xrange # pylint: disable=redefined-builtin
+# from six.moves import zip # pylint: disable=redefined-builtin
 
 from tensorflow.contrib.rnn.python.ops import core_rnn
 from tensorflow.contrib.rnn.python.ops import core_rnn_cell
@@ -19,6 +19,7 @@
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.util import nest
 from tensorflow import multinomial, squeeze
+import tensorflow as tf
 
 # TODO(ebrevdo): Remove once _linear is fully deprecated.
linear = core_rnn_cell_impl._linear # pylint: disable=protected-access @@ -27,18 +28,18 @@ def _extract_sample_and_embed(embedding, output_projection=None, update_embedding=True): + def loop_function(prev, _): + if output_projection is not None: + prev = nn_ops.xw_plus_b(prev, output_projection[0], output_projection[1]) + # prev_symbol = math_ops.argmax(prev, 1) + prev_symbol = squeeze(multinomial(prev, 1), axis=1) + emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol) + if not update_embedding: + emb_prev = array_ops.stop_gradient(emb_prev) + return emb_prev - def loop_function(prev, _): - if output_projection is not None: - prev = nn_ops.xw_plus_b(prev, output_projection[0], output_projection[1]) - #prev_symbol = math_ops.argmax(prev, 1) - prev_symbol = squeeze(multinomial(prev, 1), axis=1) - emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol) - if not update_embedding: - emb_prev = array_ops.stop_gradient(emb_prev) - return emb_prev + return loop_function - return loop_function def embedding_attention_sampled_seq2seq(encoder_inputs, decoder_inputs, @@ -51,121 +52,124 @@ def embedding_attention_sampled_seq2seq(encoder_inputs, feed_previous=False, dtype=None, scope=None, - initial_state_attention=False): - """Embedding sequence-to-sequence model with attention. - - This model first embeds encoder_inputs by a newly created embedding (of shape - [num_encoder_symbols x input_size]). Then it runs an RNN to encode - embedded encoder_inputs into a state vector. It keeps the outputs of this - RNN at every step to use for attention later. Next, it embeds decoder_inputs - by another newly created embedding (of shape [num_decoder_symbols x - input_size]). Then it runs attention decoder, initialized with the last - encoder state, on embedded decoder_inputs and attending to encoder outputs. - - Warning: when output_projection is None, the size of the attention vectors - and variables will be made proportional to num_decoder_symbols, can be large. - - Args: - encoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. - decoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. - cell: core_rnn_cell.RNNCell defining the cell function and size. - num_encoder_symbols: Integer; number of symbols on the encoder side. - num_decoder_symbols: Integer; number of symbols on the decoder side. - embedding_size: Integer, the length of the embedding vector for each symbol. - num_heads: Number of attention heads that read from attention_states. - output_projection: None or a pair (W, B) of output projection weights and - biases; W has shape [output_size x num_decoder_symbols] and B has - shape [num_decoder_symbols]; if provided and feed_previous=True, each - fed previous output will first be multiplied by W and added B. - feed_previous: Boolean or scalar Boolean Tensor; if True, only the first - of decoder_inputs will be used (the "GO" symbol), and all other decoder - inputs will be taken from previous outputs (as in embedding_rnn_decoder). - If False, decoder_inputs are used as given (the standard decoder case). - dtype: The dtype of the initial RNN state (default: tf.float32). - scope: VariableScope for the created subgraph; defaults to - "embedding_attention_seq2seq". - initial_state_attention: If False (default), initial attentions are zero. - If True, initialize the attentions from the initial state and attention - states. 
- - Returns: - A tuple of the form (outputs, state), where: - outputs: A list of the same length as decoder_inputs of 2D Tensors with - shape [batch_size x num_decoder_symbols] containing the generated - outputs. - state: The state of each decoder cell at the final time-step. - It is a 2D Tensor of shape [batch_size x cell.state_size]. - """ - with variable_scope.variable_scope( - scope or "embedding_attention_seq2seq", dtype=dtype) as scope: - dtype = scope.dtype - # Encoder. - encoder_cell = core_rnn_cell.EmbeddingWrapper( - cell, - embedding_classes=num_encoder_symbols, - embedding_size=embedding_size) - encoder_outputs, encoder_state = core_rnn.static_rnn( - encoder_cell, encoder_inputs, dtype=dtype) - - # First calculate a concatenation of encoder outputs to put attention on. - top_states = [ - array_ops.reshape(e, [-1, 1, cell.output_size]) for e in encoder_outputs - ] - attention_states = array_ops.concat(top_states, 1) - - # Decoder. - output_size = None - if output_projection is None: - cell = core_rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols) - output_size = num_decoder_symbols - - if isinstance(feed_previous, bool): - return embedding_attention_decoder( - decoder_inputs, - encoder_state, - attention_states, - cell, - num_decoder_symbols, - embedding_size, - num_heads=num_heads, - output_size=output_size, - output_projection=output_projection, - feed_previous=feed_previous, - initial_state_attention=initial_state_attention) - - # If feed_previous is a Tensor, we construct 2 graphs and use cond. - def decoder(feed_previous_bool): - reuse = None if feed_previous_bool else True - with variable_scope.variable_scope( - variable_scope.get_variable_scope(), reuse=reuse) as scope: - outputs, state = embedding_attention_decoder( - decoder_inputs, - encoder_state, - attention_states, + initial_state_attention=False, + batch_size=None): + """Embedding sequence-to-sequence model with attention. + + This model first embeds encoder_inputs by a newly created embedding (of shape + [num_encoder_symbols x input_size]). Then it runs an RNN to encode + embedded encoder_inputs into a state vector. It keeps the outputs of this + RNN at every step to use for attention later. Next, it embeds decoder_inputs + by another newly created embedding (of shape [num_decoder_symbols x + input_size]). Then it runs attention decoder, initialized with the last + encoder state, on embedded decoder_inputs and attending to encoder outputs. + + Warning: when output_projection is None, the size of the attention vectors + and variables will be made proportional to num_decoder_symbols, can be large. + + Args: + encoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. + decoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. + cell: core_rnn_cell.RNNCell defining the cell function and size. + num_encoder_symbols: Integer; number of symbols on the encoder side. + num_decoder_symbols: Integer; number of symbols on the decoder side. + embedding_size: Integer, the length of the embedding vector for each symbol. + num_heads: Number of attention heads that read from attention_states. + output_projection: None or a pair (W, B) of output projection weights and + biases; W has shape [output_size x num_decoder_symbols] and B has + shape [num_decoder_symbols]; if provided and feed_previous=True, each + fed previous output will first be multiplied by W and added B. 
+ feed_previous: Boolean or scalar Boolean Tensor; if True, only the first + of decoder_inputs will be used (the "GO" symbol), and all other decoder + inputs will be taken from previous outputs (as in embedding_rnn_decoder). + If False, decoder_inputs are used as given (the standard decoder case). + dtype: The dtype of the initial RNN state (default: tf.float32). + scope: VariableScope for the created subgraph; defaults to + "embedding_attention_seq2seq". + initial_state_attention: If False (default), initial attentions are zero. + If True, initialize the attentions from the initial state and attention + states. + + Returns: + A tuple of the form (outputs, state), where: + outputs: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x num_decoder_symbols] containing the generated + outputs. + state: The state of each decoder cell at the final time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. + """ + with variable_scope.variable_scope( + scope or "embedding_attention_seq2seq", dtype=dtype) as scope: + dtype = scope.dtype + # Encoder. + encoder_cell = core_rnn_cell.EmbeddingWrapper( cell, - num_decoder_symbols, - embedding_size, - num_heads=num_heads, - output_size=output_size, - output_projection=output_projection, - feed_previous=feed_previous_bool, - update_embedding_for_previous=False, - initial_state_attention=initial_state_attention) - state_list = [state] - if nest.is_sequence(state): - state_list = nest.flatten(state) - return outputs + state_list - - outputs_and_state = control_flow_ops.cond(feed_previous, - lambda: decoder(True), - lambda: decoder(False)) - outputs_len = len(decoder_inputs) # Outputs length same as decoder inputs. - state_list = outputs_and_state[outputs_len:] - state = state_list[0] - if nest.is_sequence(encoder_state): - state = nest.pack_sequence_as( - structure=encoder_state, flat_sequence=state_list) - return outputs_and_state[:outputs_len], state + embedding_classes=num_encoder_symbols, + embedding_size=embedding_size) + encoder_outputs, encoder_state = core_rnn.static_rnn( + encoder_cell, encoder_inputs, dtype=dtype) + + # First calculate a concatenation of encoder outputs to put attention on. + top_states = [ + array_ops.reshape(e, [-1, 1, cell.output_size]) for e in encoder_outputs + ] + attention_states = array_ops.concat(top_states, 1) + + # Decoder. + output_size = None + if output_projection is None: + cell = core_rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols) + output_size = num_decoder_symbols + + if isinstance(feed_previous, bool): + return embedding_attention_decoder( + decoder_inputs, + encoder_state, + attention_states, + cell, + num_decoder_symbols, + embedding_size, + num_heads=num_heads, + output_size=output_size, + output_projection=output_projection, + feed_previous=feed_previous, + initial_state_attention=initial_state_attention) + + else: + # If feed_previous is a Tensor, we construct 2 graphs and use cond. 
+ def decoder(feed_previous_bool): + reuse = None if feed_previous_bool else True + with variable_scope.variable_scope( + variable_scope.get_variable_scope(), reuse=reuse) as scope: + outputs, state = embedding_attention_decoder( + decoder_inputs, + encoder_state, + attention_states, + cell, + num_decoder_symbols, + embedding_size, + num_heads=num_heads, + output_size=output_size, + output_projection=output_projection, + feed_previous=feed_previous_bool, + update_embedding_for_previous=False, + initial_state_attention=initial_state_attention) + state_list = [state] + if nest.is_sequence(state): + state_list = nest.flatten(state) + return outputs + state_list + + outputs_and_state = control_flow_ops.cond(feed_previous, + lambda: decoder(True), + lambda: decoder(False)) + outputs_len = len(decoder_inputs) # Outputs length same as decoder inputs. + state_list = outputs_and_state[outputs_len:] + state = state_list[0] + if nest.is_sequence(encoder_state): + state = nest.pack_sequence_as( + structure=encoder_state, flat_sequence=state_list) + return outputs_and_state[:outputs_len], state + def embedding_attention_decoder(decoder_inputs, initial_state, @@ -181,76 +185,77 @@ def embedding_attention_decoder(decoder_inputs, dtype=None, scope=None, initial_state_attention=False): - """RNN decoder with embedding and attention and a pure-decoding option. - - Args: - decoder_inputs: A list of 1D batch-sized int32 Tensors (decoder inputs). - initial_state: 2D Tensor [batch_size x cell.state_size]. - attention_states: 3D Tensor [batch_size x attn_length x attn_size]. - cell: core_rnn_cell.RNNCell defining the cell function. - num_symbols: Integer, how many symbols come into the embedding. - embedding_size: Integer, the length of the embedding vector for each symbol. - num_heads: Number of attention heads that read from attention_states. - output_size: Size of the output vectors; if None, use output_size. - output_projection: None or a pair (W, B) of output projection weights and - biases; W has shape [output_size x num_symbols] and B has shape - [num_symbols]; if provided and feed_previous=True, each fed previous - output will first be multiplied by W and added B. - feed_previous: Boolean; if True, only the first of decoder_inputs will be - used (the "GO" symbol), and all other decoder inputs will be generated by: - next = embedding_lookup(embedding, argmax(previous_output)), - In effect, this implements a greedy decoder. It can also be used - during training to emulate http://arxiv.org/abs/1506.03099. - If False, decoder_inputs are used as given (the standard decoder case). - update_embedding_for_previous: Boolean; if False and feed_previous=True, - only the embedding for the first symbol of decoder_inputs (the "GO" - symbol) will be updated by back propagation. Embeddings for the symbols - generated from the decoder itself remain unchanged. This parameter has - no effect if feed_previous=False. - dtype: The dtype to use for the RNN initial states (default: tf.float32). - scope: VariableScope for the created subgraph; defaults to - "embedding_attention_decoder". - initial_state_attention: If False (default), initial attentions are zero. - If True, initialize the attentions from the initial state and attention - states -- useful when we wish to resume decoding from a previously - stored decoder state and attention states. 
- - Returns: - A tuple of the form (outputs, state), where: - outputs: A list of the same length as decoder_inputs of 2D Tensors with - shape [batch_size x output_size] containing the generated outputs. - state: The state of each decoder cell at the final time-step. - It is a 2D Tensor of shape [batch_size x cell.state_size]. - - Raises: - ValueError: When output_projection has the wrong shape. - """ - if output_size is None: - output_size = cell.output_size - if output_projection is not None: - proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype) - proj_biases.get_shape().assert_is_compatible_with([num_symbols]) - - with variable_scope.variable_scope( - scope or "embedding_attention_decoder", dtype=dtype) as scope: - - embedding = variable_scope.get_variable("embedding", - [num_symbols, embedding_size]) - loop_function = _extract_sample_and_embed( - embedding, output_projection, - update_embedding_for_previous) if feed_previous else None - emb_inp = [ - embedding_ops.embedding_lookup(embedding, i) for i in decoder_inputs - ] - return attention_decoder( - emb_inp, - initial_state, - attention_states, - cell, - output_size=output_size, - num_heads=num_heads, - loop_function=loop_function, - initial_state_attention=initial_state_attention) + """RNN decoder with embedding and attention and a pure-decoding option. + + Args: + decoder_inputs: A list of 1D batch-sized int32 Tensors (decoder inputs). + initial_state: 2D Tensor [batch_size x cell.state_size]. + attention_states: 3D Tensor [batch_size x attn_length x attn_size]. + cell: core_rnn_cell.RNNCell defining the cell function. + num_symbols: Integer, how many symbols come into the embedding. + embedding_size: Integer, the length of the embedding vector for each symbol. + num_heads: Number of attention heads that read from attention_states. + output_size: Size of the output vectors; if None, use output_size. + output_projection: None or a pair (W, B) of output projection weights and + biases; W has shape [output_size x num_symbols] and B has shape + [num_symbols]; if provided and feed_previous=True, each fed previous + output will first be multiplied by W and added B. + feed_previous: Boolean; if True, only the first of decoder_inputs will be + used (the "GO" symbol), and all other decoder inputs will be generated by: + next = embedding_lookup(embedding, argmax(previous_output)), + In effect, this implements a greedy decoder. It can also be used + during training to emulate http://arxiv.org/abs/1506.03099. + If False, decoder_inputs are used as given (the standard decoder case). + update_embedding_for_previous: Boolean; if False and feed_previous=True, + only the embedding for the first symbol of decoder_inputs (the "GO" + symbol) will be updated by back propagation. Embeddings for the symbols + generated from the decoder itself remain unchanged. This parameter has + no effect if feed_previous=False. + dtype: The dtype to use for the RNN initial states (default: tf.float32). + scope: VariableScope for the created subgraph; defaults to + "embedding_attention_decoder". + initial_state_attention: If False (default), initial attentions are zero. + If True, initialize the attentions from the initial state and attention + states -- useful when we wish to resume decoding from a previously + stored decoder state and attention states. 
+ + Returns: + A tuple of the form (outputs, state), where: + outputs: A list of the same length as decoder_inputs of 2D Tensors with + shape [batch_size x output_size] containing the generated outputs. + state: The state of each decoder cell at the final time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. + + Raises: + ValueError: When output_projection has the wrong shape. + """ + if output_size is None: + output_size = cell.output_size + if output_projection is not None: + proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype) + proj_biases.get_shape().assert_is_compatible_with([num_symbols]) + + with variable_scope.variable_scope( + scope or "embedding_attention_decoder", dtype=dtype) as scope: + + embedding = variable_scope.get_variable("embedding", + [num_symbols, embedding_size]) + loop_function = _extract_sample_and_embed( + embedding, output_projection, + update_embedding_for_previous) if feed_previous else None + emb_inp = [ + embedding_ops.embedding_lookup(embedding, i) for i in decoder_inputs + ] + return attention_decoder( + emb_inp, + initial_state, + attention_states, + cell, + output_size=output_size, + num_heads=num_heads, + loop_function=loop_function, + initial_state_attention=initial_state_attention) + def attention_decoder(decoder_inputs, initial_state, @@ -262,154 +267,154 @@ def attention_decoder(decoder_inputs, dtype=None, scope=None, initial_state_attention=False): - """RNN decoder with attention for the sequence-to-sequence model. - - In this context "attention" means that, during decoding, the RNN can look up - information in the additional tensor attention_states, and it does this by - focusing on a few entries from the tensor. This model has proven to yield - especially good results in a number of sequence-to-sequence tasks. This - implementation is based on http://arxiv.org/abs/1412.7449 (see below for - details). It is recommended for complex sequence-to-sequence tasks. - - Args: - decoder_inputs: A list of 2D Tensors [batch_size x input_size]. - initial_state: 2D Tensor [batch_size x cell.state_size]. - attention_states: 3D Tensor [batch_size x attn_length x attn_size]. - cell: core_rnn_cell.RNNCell defining the cell function and size. - output_size: Size of the output vectors; if None, we use cell.output_size. - num_heads: Number of attention heads that read from attention_states. - loop_function: If not None, this function will be applied to i-th output - in order to generate i+1-th input, and decoder_inputs will be ignored, - except for the first element ("GO" symbol). This can be used for decoding, - but also for training to emulate http://arxiv.org/abs/1506.03099. - Signature -- loop_function(prev, i) = next - * prev is a 2D Tensor of shape [batch_size x output_size], - * i is an integer, the step number (when advanced control is needed), - * next is a 2D Tensor of shape [batch_size x input_size]. - dtype: The dtype to use for the RNN initial state (default: tf.float32). - scope: VariableScope for the created subgraph; default: "attention_decoder". - initial_state_attention: If False (default), initial attentions are zero. - If True, initialize the attentions from the initial state and attention - states -- useful when we wish to resume decoding from a previously - stored decoder state and attention states. - - Returns: - A tuple of the form (outputs, state), where: - outputs: A list of the same length as decoder_inputs of 2D Tensors of - shape [batch_size x output_size]. These represent the generated outputs. 
- Output i is computed from input i (which is either the i-th element - of decoder_inputs or loop_function(output {i-1}, i)) as follows. - First, we run the cell on a combination of the input and previous - attention masks: - cell_output, new_state = cell(linear(input, prev_attn), prev_state). - Then, we calculate new attention masks: - new_attn = softmax(V^T * tanh(W * attention_states + U * new_state)) - and then we calculate the output: - output = linear(cell_output, new_attn). - state: The state of each decoder cell the final time-step. - It is a 2D Tensor of shape [batch_size x cell.state_size]. - - Raises: - ValueError: when num_heads is not positive, there are no inputs, shapes - of attention_states are not set, or input size cannot be inferred - from the input. - """ - if not decoder_inputs: - raise ValueError("Must provide at least 1 input to attention decoder.") - if num_heads < 1: - raise ValueError("With less than 1 heads, use a non-attention decoder.") - if attention_states.get_shape()[2].value is None: - raise ValueError("Shape[2] of attention_states must be known: %s" % - attention_states.get_shape()) - if output_size is None: - output_size = cell.output_size - - with variable_scope.variable_scope( - scope or "attention_decoder", dtype=dtype) as scope: - dtype = scope.dtype - - batch_size = array_ops.shape(decoder_inputs[0])[0] # Needed for reshaping. - attn_length = attention_states.get_shape()[1].value - if attn_length is None: - attn_length = array_ops.shape(attention_states)[1] - attn_size = attention_states.get_shape()[2].value - - # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before. - hidden = array_ops.reshape(attention_states, - [-1, attn_length, 1, attn_size]) - hidden_features = [] - v = [] - attention_vec_size = attn_size # Size of query vectors for attention. - for a in xrange(num_heads): - k = variable_scope.get_variable("AttnW_%d" % a, - [1, 1, attn_size, attention_vec_size]) - hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")) - v.append( - variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size])) - - state = initial_state - - def attention(query): - """Put attention masks on hidden using hidden_features and query.""" - ds = [] # Results of attention reads will be stored here. - if nest.is_sequence(query): # If the query is a tuple, flatten it. - query_list = nest.flatten(query) - for q in query_list: # Check that ndims == 2 if specified. - ndims = q.get_shape().ndims - if ndims: - assert ndims == 2 - query = array_ops.concat(query_list, 1) - for a in xrange(num_heads): - with variable_scope.variable_scope("Attention_%d" % a): - y = linear(query, attention_vec_size, True) - y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) - # Attention mask is a softmax of v^T * tanh(...). - s = math_ops.reduce_sum(v[a] * math_ops.tanh(hidden_features[a] + y), - [2, 3]) - a = nn_ops.softmax(s) - # Now calculate the attention-weighted vector d. - d = math_ops.reduce_sum( - array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2]) - ds.append(array_ops.reshape(d, [-1, attn_size])) - return ds - - outputs = [] - prev = None - batch_attn_size = array_ops.stack([batch_size, attn_size]) - attns = [ - array_ops.zeros( - batch_attn_size, dtype=dtype) for _ in xrange(num_heads) - ] - for a in attns: # Ensure the second shape of attention vectors is set. 
- a.set_shape([None, attn_size]) - if initial_state_attention: - attns = attention(initial_state) - for i, inp in enumerate(decoder_inputs): - if i > 0: - variable_scope.get_variable_scope().reuse_variables() - # If loop_function is set, we use it instead of decoder_inputs. - if loop_function is not None and prev is not None: - with variable_scope.variable_scope("loop_function", reuse=True): - inp = loop_function(prev, i) - # Merge input and previous attentions into one vector of the right size. - input_size = inp.get_shape().with_rank(2)[1] - if input_size.value is None: - raise ValueError("Could not infer input size from input: %s" % inp.name) - x = linear([inp] + attns, input_size, True) - # Run the RNN. - cell_output, state = cell(x, state) - # Run the attention mechanism. - if i == 0 and initial_state_attention: - with variable_scope.variable_scope( - variable_scope.get_variable_scope(), reuse=True): - attns = attention(state) - else: - attns = attention(state) - - with variable_scope.variable_scope("AttnOutputProjection"): - output = linear([cell_output] + attns, output_size, True) - if loop_function is not None: - prev = output - outputs.append(output) - - return outputs, state \ No newline at end of file + """RNN decoder with attention for the sequence-to-sequence model. + + In this context "attention" means that, during decoding, the RNN can look up + information in the additional tensor attention_states, and it does this by + focusing on a few entries from the tensor. This model has proven to yield + especially good results in a number of sequence-to-sequence tasks. This + implementation is based on http://arxiv.org/abs/1412.7449 (see below for + details). It is recommended for complex sequence-to-sequence tasks. + + Args: + decoder_inputs: A list of 2D Tensors [batch_size x input_size]. + initial_state: 2D Tensor [batch_size x cell.state_size]. + attention_states: 3D Tensor [batch_size x attn_length x attn_size]. + cell: core_rnn_cell.RNNCell defining the cell function and size. + output_size: Size of the output vectors; if None, we use cell.output_size. + num_heads: Number of attention heads that read from attention_states. + loop_function: If not None, this function will be applied to i-th output + in order to generate i+1-th input, and decoder_inputs will be ignored, + except for the first element ("GO" symbol). This can be used for decoding, + but also for training to emulate http://arxiv.org/abs/1506.03099. + Signature -- loop_function(prev, i) = next + * prev is a 2D Tensor of shape [batch_size x output_size], + * i is an integer, the step number (when advanced control is needed), + * next is a 2D Tensor of shape [batch_size x input_size]. + dtype: The dtype to use for the RNN initial state (default: tf.float32). + scope: VariableScope for the created subgraph; default: "attention_decoder". + initial_state_attention: If False (default), initial attentions are zero. + If True, initialize the attentions from the initial state and attention + states -- useful when we wish to resume decoding from a previously + stored decoder state and attention states. + + Returns: + A tuple of the form (outputs, state), where: + outputs: A list of the same length as decoder_inputs of 2D Tensors of + shape [batch_size x output_size]. These represent the generated outputs. + Output i is computed from input i (which is either the i-th element + of decoder_inputs or loop_function(output {i-1}, i)) as follows. 
+ First, we run the cell on a combination of the input and previous + attention masks: + cell_output, new_state = cell(linear(input, prev_attn), prev_state). + Then, we calculate new attention masks: + new_attn = softmax(V^T * tanh(W * attention_states + U * new_state)) + and then we calculate the output: + output = linear(cell_output, new_attn). + state: The state of each decoder cell the final time-step. + It is a 2D Tensor of shape [batch_size x cell.state_size]. + + Raises: + ValueError: when num_heads is not positive, there are no inputs, shapes + of attention_states are not set, or input size cannot be inferred + from the input. + """ + if not decoder_inputs: + raise ValueError("Must provide at least 1 input to attention decoder.") + if num_heads < 1: + raise ValueError("With less than 1 heads, use a non-attention decoder.") + if attention_states.get_shape()[2].value is None: + raise ValueError("Shape[2] of attention_states must be known: %s" % + attention_states.get_shape()) + if output_size is None: + output_size = cell.output_size + + with variable_scope.variable_scope( + scope or "attention_decoder", dtype=dtype) as scope: + dtype = scope.dtype + + batch_size = array_ops.shape(decoder_inputs[0])[0] # Needed for reshaping. + attn_length = attention_states.get_shape()[1].value + if attn_length is None: + attn_length = array_ops.shape(attention_states)[1] + attn_size = attention_states.get_shape()[2].value + + # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before. + hidden = array_ops.reshape(attention_states, + [-1, attn_length, 1, attn_size]) + hidden_features = [] + v = [] + attention_vec_size = attn_size # Size of query vectors for attention. + for a in xrange(num_heads): + k = variable_scope.get_variable("AttnW_%d" % a, + [1, 1, attn_size, attention_vec_size]) + hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")) + v.append( + variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size])) + + state = initial_state + + def attention(query): + """Put attention masks on hidden using hidden_features and query.""" + ds = [] # Results of attention reads will be stored here. + if nest.is_sequence(query): # If the query is a tuple, flatten it. + query_list = nest.flatten(query) + for q in query_list: # Check that ndims == 2 if specified. + ndims = q.get_shape().ndims + if ndims: + assert ndims == 2 + query = array_ops.concat(query_list, 1) + for a in xrange(num_heads): + with variable_scope.variable_scope("Attention_%d" % a): + y = linear(query, attention_vec_size, True) + y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) + # Attention mask is a softmax of v^T * tanh(...). + s = math_ops.reduce_sum(v[a] * math_ops.tanh(hidden_features[a] + y), + [2, 3]) + a = nn_ops.softmax(s) + # Now calculate the attention-weighted vector d. + d = math_ops.reduce_sum( + array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2]) + ds.append(array_ops.reshape(d, [-1, attn_size])) + return ds + + outputs = [] + prev = None + batch_attn_size = array_ops.stack([batch_size, attn_size]) + attns = [ + array_ops.zeros( + batch_attn_size, dtype=dtype) for _ in xrange(num_heads) + ] + for a in attns: # Ensure the second shape of attention vectors is set. + a.set_shape([None, attn_size]) + if initial_state_attention: + attns = attention(initial_state) + for i, inp in enumerate(decoder_inputs): + if i > 0: + variable_scope.get_variable_scope().reuse_variables() + # If loop_function is set, we use it instead of decoder_inputs. 
+ if loop_function is not None and prev is not None: + with variable_scope.variable_scope("loop_function", reuse=True): + inp = loop_function(prev, i) + # Merge input and previous attentions into one vector of the right size. + input_size = inp.get_shape().with_rank(2)[1] + if input_size.value is None: + raise ValueError("Could not infer input size from input: %s" % inp.name) + x = linear([inp] + attns, input_size, True) + # Run the RNN. + cell_output, state = cell(x, state) + # Run the attention mechanism. + if i == 0 and initial_state_attention: + with variable_scope.variable_scope( + variable_scope.get_variable_scope(), reuse=True): + attns = attention(state) + else: + attns = attention(state) + + with variable_scope.variable_scope("AttnOutputProjection"): + output = linear([cell_output] + attns, output_size, True) + if loop_function is not None: + prev = output + outputs.append(output) + + return outputs, state diff --git a/tf_chatbot/lib/data_utils.py b/tf_chatbot/lib/data_utils.py index 0c11bbd..cb976ee 100644 --- a/tf_chatbot/lib/data_utils.py +++ b/tf_chatbot/lib/data_utils.py @@ -70,7 +70,7 @@ def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size, if not gfile.Exists(vocabulary_path): print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path)) vocab = {} - data = json.load(open(data_path, encoding=_ENCODING)) + data = json.load(open(data_path), encoding=_ENCODING) counter = 0 for ((q,qe),(a,ae)) in data: counter += 1 @@ -122,8 +122,8 @@ def sentence_to_token_ids(sentence, vocabulary, else: words = basic_tokenizer(sentence) if not normalize_digits: - return [vocabulary.get(w, UNK_ID) for w in words] - return [vocabulary.get(re.sub(_DIGIT_RE, "0", w), UNK_ID) for w in words] + return [vocabulary.get(w.encode('utf8'), UNK_ID) for w in words] + return [vocabulary.get(re.sub(_DIGIT_RE, "0", w.encode('utf8')), UNK_ID) for w in words] def data_to_token_ids_bak(data_path, target_path, vocabulary_path, tokenizer=None, normalize_digits=True): @@ -147,7 +147,7 @@ def data_to_token_ids(data_path, target_path, vocabulary_path, print("Tokenizing data in %s" % data_path) vocab, _ = initialize_vocabulary(vocabulary_path) with gfile.GFile(target_path, mode='w') as tokens_file: - data = json.load(open(data_path, encoding=_ENCODING)) + data = json.load(open(data_path), encoding=_ENCODING) counter = 0 for ((q,qe),(a,ae)) in data: counter += 1 diff --git a/tf_chatbot/lib/predict.py b/tf_chatbot/lib/predict.py index 01ecf44..00b23f7 100644 --- a/tf_chatbot/lib/predict.py +++ b/tf_chatbot/lib/predict.py @@ -9,7 +9,7 @@ def predict(): def _get_test_dataset(): - data = json.load(open(TEST_DATASET_PATH, encoding=data_utils._ENCODING)) + data = json.load(open(TEST_DATASET_PATH)) test_sentences = [q for ((q, qe), _) in data] return test_sentences @@ -27,7 +27,8 @@ def _get_test_dataset(): test_dataset = _get_test_dataset() for sentence in test_dataset: - predicted_sentence = get_predicted_sentence(sentence, vocab, rev_vocab, model, sess) - print(sentence, '->', predicted_sentence) + predicted_sentence = get_predicted_sentence(sentence, vocab, rev_vocab, model, sess, use_beam_search=True) + print(sentence, '->') + print(predicted_sentence) results_fh.write(predicted_sentence + '\n') \ No newline at end of file diff --git a/tf_chatbot/lib/seq2seq_model.py b/tf_chatbot/lib/seq2seq_model.py index f0fa16b..61ecde3 100644 --- a/tf_chatbot/lib/seq2seq_model.py +++ b/tf_chatbot/lib/seq2seq_model.py @@ -9,9 +9,10 @@ import tensorflow as tf import tf_chatbot.lib.data_utils as data_utils 
-from tensorflow.contrib.legacy_seq2seq import model_with_buckets, embedding_attention_seq2seq +from tensorflow.contrib.legacy_seq2seq import model_with_buckets, sequence_loss, embedding_attention_decoder from tf_chatbot.lib.basic.advanced_seq2seq import embedding_attention_sampled_seq2seq -from tensorflow.contrib.rnn import GRUCell, BasicLSTMCell, MultiRNNCell +from tensorflow.contrib.rnn import GRUCell, BasicLSTMCell, MultiRNNCell, EmbeddingWrapper, static_rnn, OutputProjectionWrapper + class Seq2SeqModel(object): """Sequence-to-sequence model with attention and for multiple buckets. @@ -40,6 +41,7 @@ def __init__(self, use_lstm=False, num_samples=512, forward_only=False, + beam_search_size=1, dtype=tf.float32): """Create the model. Args: @@ -72,6 +74,7 @@ def __init__(self, self.learning_rate_decay_op = self.learning_rate.assign( self.learning_rate * learning_rate_decay_factor) self.global_step = tf.Variable(0, trainable=False) + self.beam_search_size = beam_search_size # If we use sampled softmax, we need an output projection. output_projection = None @@ -113,8 +116,65 @@ def single_cell(): if num_layers > 1: cell = MultiRNNCell([single_cell() for _ in range(num_layers)]) + self.model_encoder_states = {} + self.model_attention_states = {} + self.topk_probs = [] + self.topk_ids = [] + # The seq2seq function: we use embedding for the input and attention. - def seq2seq_f(encoder_inputs, decoder_inputs, do_decode): + def seq2seq_f(encoder_inputs, decoder_inputs, do_decode, bucket_id): + + def embedding_attention_sampled_seq2seq(encoder_inputs, + decoder_inputs, + cell, + num_encoder_symbols, + num_decoder_symbols, + embedding_size, + bucket_index, + num_heads=1, + output_projection=None, + feed_previous=False, + initial_state_attention=False, + dtype=tf.float32): + with tf.variable_scope("embedding_attention_sampled_seq2seq"): + encoder_cell = EmbeddingWrapper( + cell, + embedding_classes=num_encoder_symbols, + embedding_size=embedding_size + ) + encoder_outputs, encoder_state = static_rnn( + encoder_cell, encoder_inputs, dtype=dtype) + + top_states = [ + tf.reshape(e, [-1,1, cell.output_size]) for e in encoder_outputs + ] + attention_states = tf.concat(top_states, 1) + + self.model_encoder_states[bucket_index] = encoder_state + self.model_attention_states[bucket_index] = attention_states + + output_size = None + if output_projection is None: + cell = OutputProjectionWrapper(cell, num_decoder_symbols) + output_size = num_decoder_symbols + + if isinstance(feed_previous, bool): + return embedding_attention_decoder( + decoder_inputs, + self.model_encoder_states[bucket_index], #encoder_state, + self.model_attention_states[bucket_index], #attention_states, + cell, + num_decoder_symbols, + embedding_size, + num_heads=num_heads, + output_size=output_size, + output_projection=output_projection, + feed_previous=feed_previous, + initial_state_attention=initial_state_attention) + + else: + raise NotImplementedError() + return embedding_attention_sampled_seq2seq( encoder_inputs, decoder_inputs, @@ -122,9 +182,9 @@ def seq2seq_f(encoder_inputs, decoder_inputs, do_decode): num_encoder_symbols=source_vocab_size, num_decoder_symbols=target_vocab_size, embedding_size=size, + bucket_index=bucket_id, output_projection=output_projection, - feed_previous=do_decode, - dtype=dtype) + feed_previous=do_decode) # Feeds for inputs. 
self.encoder_inputs = [] @@ -143,25 +203,43 @@ def seq2seq_f(encoder_inputs, decoder_inputs, do_decode): targets = [self.decoder_inputs[i + 1] for i in range(len(self.decoder_inputs) - 1)] - # Training outputs and losses. - if forward_only: - self.outputs, self.losses = model_with_buckets( - self.encoder_inputs, self.decoder_inputs, targets, - self.target_weights, buckets, lambda x, y: seq2seq_f(x, y, True), - softmax_loss_function=softmax_loss_function) - # If we use output projection, we need to project outputs for decoding. - if output_projection is not None: - for b in range(len(buckets)): - self.outputs[b] = [ - tf.matmul(output, output_projection[0]) + output_projection[1] - for output in self.outputs[b] - ] - else: - self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets( - self.encoder_inputs, self.decoder_inputs, targets, - self.target_weights, buckets, - lambda x, y: seq2seq_f(x, y, False), - softmax_loss_function=softmax_loss_function) + with tf.variable_scope("model_with_buckets"): + self.losses = [] + self.outputs = [] + self.decoder_out_state = [] + for bucket_idx, bucket in enumerate(buckets): + with tf.variable_scope(tf.get_variable_scope(), reuse=True if bucket_idx > 0 else None): + if forward_only: + bucket_outputs, bucket_outputs_state = seq2seq_f(self.encoder_inputs[:bucket[0]], + self.decoder_inputs[:bucket[1]], + True, + bucket_idx) + else: + bucket_outputs, bucket_outputs_state = seq2seq_f(self.encoder_inputs[:bucket[0]], + self.decoder_inputs[:bucket[1]], + False, + bucket_idx) + self.outputs.append(bucket_outputs) + self.decoder_out_state.append(bucket_outputs_state) + self.losses.append( + sequence_loss( + self.outputs[-1], + targets[:bucket[1]], + self.target_weights[:bucket[1]], + softmax_loss_function=softmax_loss_function + ) + ) + + if forward_only and output_projection is not None: + for b in range(len(buckets)): + self.outputs[b] = [ + tf.matmul(output, output_projection[0]) + output_projection[1] + for output in self.outputs[b]] + #best_outputs = [tf.argmax(x,1) for x in self.outputs[b]] + #best_outputs = tf.concat(axis=1, values=[tf.reshape(x, [self.batch_size, 1]) for x in best_outputs]) + _topk_log_probs, _topk_ids = tf.nn.top_k(tf.nn.softmax(self.outputs[b][-1]), beam_search_size) + self.topk_probs.append(_topk_log_probs) + self.topk_ids.append(_topk_ids) # Gradients and SGD update operation for training the model. params = tf.trainable_variables() @@ -179,24 +257,84 @@ def seq2seq_f(encoder_inputs, decoder_inputs, do_decode): self.saver = tf.train.Saver(tf.global_variables()) + def step_beam_search(self, session, encoder_inputs, decoder_inputs, target_weights, + bucket_id, forward_only, use_beam_search=True): + encoder_size, decoder_size = self.buckets[bucket_id] + if len(encoder_inputs) != encoder_size: + raise ValueError("Encoder length must be equal to the one in bucket," + " %d != %d." % (len(encoder_inputs), encoder_size)) + if len(decoder_inputs) != decoder_size: + raise ValueError("Decoder length must be equal to the one in bucket," + " %d != %d." % (len(decoder_inputs), decoder_size)) + if len(target_weights) != decoder_size: + raise ValueError("Weights length must be equal to the one in bucket," + " %d != %d." 
% (len(target_weights), decoder_size)) + + input_feed = {} + for l in range(encoder_size): + input_feed[self.encoder_inputs[l].name] = encoder_inputs[l] + for l in range(decoder_size): + input_feed[self.decoder_inputs[l].name] = decoder_inputs[l] + input_feed[self.target_weights[l].name] = target_weights[l] + + last_target = self.decoder_inputs[decoder_size].name + input_feed[last_target] = np.zeros([self.batch_size], dtype=np.int32) + + if not forward_only: + raise NotImplementedError("Not Implemented!!!") + else: + if use_beam_search: + output_feed = [self.model_attention_states[bucket_id], + self.model_encoder_states[bucket_id]] + outputs = session.run(output_feed, input_feed) # attention_state, encoder_state + beams = [(0.0, [data_utils.GO_ID], data_utils.GO_ID, outputs[1])] * 3 # score, result, last_token, encoder_state + result = [] + step = 0 + attention_state = outputs[0] + + while step < decoder_size and len(result) < self.beam_search_size: + step += 1 + _last_tokens = [beam_[2] for beam_ in beams] + _encoder_state = [beam_[3] for beam_ in beams] + output_feed = [self.topk_ids[bucket_id], self.topk_probs[bucket_id], self.decoder_out_state[bucket_id]] + input_feed = {} + input_feed[self.model_attention_states[bucket_id].name] = attention_state + input_feed[self.model_encoder_states[bucket_id].name] = np.squeeze(np.array(_encoder_state)) + for l in range(step): + _decoder_inputs = [beam_[1][l] for beam_ in beams] + input_feed[self.decoder_inputs[l].name] = _decoder_inputs + + _tok_ids, _tok_probs, _out_states = session.run(output_feed, input_feed) + + new_beams = [] + + for beam_idx in range(self.beam_search_size): + for _idx in range(self.beam_search_size): + new_beams.append((beams[beam_idx][0]+_tok_probs[beam_idx][_idx], beams[beam_idx][1]+[_tok_ids[beam_idx][_idx]], _tok_ids[beam_idx][_idx], _out_states[beam_idx])) + + new_beams.sort(key=lambda x: x[0], reverse=True) + beams = [] + for beam_ in new_beams: + if beam_[2] == data_utils.EOS_ID: + result.append((beam_[0],beam_[1][:-1],beam_[2],beam_[3])) + else: + beams.append(beam_) + if len(beams) == self.beam_search_size: + break + + if step == decoder_size: + for beam_ in beams: + result.append(beam_) + if len(result) == self.beam_search_size: + break + return None, None, result[0][1] + + else: + raise NotImplementedError("Not Implemented!!!") + + def step(self, session, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only): - """Run a step of the model feeding the given inputs. - Args: - session: tensorflow session to use. - encoder_inputs: list of numpy int vectors to feed as encoder inputs. - decoder_inputs: list of numpy int vectors to feed as decoder inputs. - target_weights: list of numpy float vectors to feed as target weights. - bucket_id: which bucket of the model to use. - forward_only: whether to do the backward step or only forward. - Returns: - A triple consisting of gradient norm (or None if we did not do backward), - average perplexity, and the outputs. - Raises: - ValueError: if length of encoder_inputs, decoder_inputs, or - target_weights disagrees with bucket size for the specified bucket_id. - """ - # Check if the sizes match. encoder_size, decoder_size = self.buckets[bucket_id] if len(encoder_inputs) != encoder_size: raise ValueError("Encoder length must be equal to the one in bucket," @@ -208,7 +346,6 @@ def step(self, session, encoder_inputs, decoder_inputs, target_weights, raise ValueError("Weights length must be equal to the one in bucket," " %d != %d." 
% (len(target_weights), decoder_size)) - # Input feed: encoder inputs, decoder inputs, target_weights, as provided. input_feed = {} for l in range(encoder_size): input_feed[self.encoder_inputs[l].name] = encoder_inputs[l] @@ -216,11 +353,9 @@ def step(self, session, encoder_inputs, decoder_inputs, target_weights, input_feed[self.decoder_inputs[l].name] = decoder_inputs[l] input_feed[self.target_weights[l].name] = target_weights[l] - # Since our targets are decoder inputs shifted by one, we need one more. last_target = self.decoder_inputs[decoder_size].name input_feed[last_target] = np.zeros([self.batch_size], dtype=np.int32) - # Output feed: depends on whether we do a backward step or not. if not forward_only: output_feed = [self.updates[bucket_id], # Update Op that does SGD. self.gradient_norms[bucket_id], # Gradient norm. @@ -231,61 +366,41 @@ def step(self, session, encoder_inputs, decoder_inputs, target_weights, output_feed.append(self.outputs[bucket_id][l]) outputs = session.run(output_feed, input_feed) + if not forward_only: return outputs[1], outputs[2], None # Gradient norm, loss, no outputs. else: return None, outputs[0], outputs[1:] # No gradient norm, loss, outputs. def get_batch(self, data, bucket_id): - """Get a random batch of data from the specified bucket, prepare for step. - To feed data in step(..) it must be a list of batch-major vectors, while - data here contains single length-major cases. So the main logic of this - function is to re-index data cases to be in the proper format for feeding. - Args: - data: a tuple of size len(self.buckets) in which each element contains - lists of pairs of input and output data that we use to create a batch. - bucket_id: integer, which bucket to get the batch for. - Returns: - The triple (encoder_inputs, decoder_inputs, target_weights) for - the constructed batch that has the proper format to call step(...) later. - """ + encoder_size, decoder_size = self.buckets[bucket_id] encoder_inputs, decoder_inputs = [], [] - # Get a random batch of encoder and decoder inputs from data, - # pad them if needed, reverse encoder inputs and add GO to decoder. for _ in range(self.batch_size): encoder_input, decoder_input = random.choice(data[bucket_id]) - # Encoder inputs are padded and then reversed. encoder_pad = [data_utils.PAD_ID] * (encoder_size - len(encoder_input)) encoder_inputs.append(list(reversed(encoder_input + encoder_pad))) - # Decoder inputs get an extra "GO" symbol, and are padded then. decoder_pad_size = decoder_size - len(decoder_input) - 1 decoder_inputs.append([data_utils.GO_ID] + decoder_input + [data_utils.PAD_ID] * decoder_pad_size) - # Now we create batch-major vectors from the data selected above. batch_encoder_inputs, batch_decoder_inputs, batch_weights = [], [], [] - # Batch encoder inputs are just re-indexed encoder_inputs. for length_idx in range(encoder_size): batch_encoder_inputs.append( np.array([encoder_inputs[batch_idx][length_idx] for batch_idx in range(self.batch_size)], dtype=np.int32)) - # Batch decoder inputs are re-indexed decoder_inputs, we create weights. for length_idx in range(decoder_size): batch_decoder_inputs.append( np.array([decoder_inputs[batch_idx][length_idx] for batch_idx in range(self.batch_size)], dtype=np.int32)) - # Create target_weights to be 0 for targets that are padding. batch_weight = np.ones(self.batch_size, dtype=np.float32) for batch_idx in range(self.batch_size): - # We set weight to 0 if the corresponding target is a PAD symbol. 
- # The corresponding target is decoder_input shifted by 1 forward. if length_idx < decoder_size - 1: target = decoder_inputs[batch_idx][length_idx + 1] if length_idx == decoder_size - 1 or target == data_utils.PAD_ID: diff --git a/tf_chatbot/lib/seq2seq_model_utils.py b/tf_chatbot/lib/seq2seq_model_utils.py index 0965ce9..79e8659 100644 --- a/tf_chatbot/lib/seq2seq_model_utils.py +++ b/tf_chatbot/lib/seq2seq_model_utils.py @@ -24,6 +24,7 @@ def create_model(session, forward_only): learning_rate=FLAGS.learning_rate, learning_rate_decay_factor=FLAGS.learning_rate_decay_factor, use_lstm=False, + beam_search_size=FLAGS.beam_search_size, forward_only=forward_only) ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir) @@ -37,7 +38,7 @@ def create_model(session, forward_only): session.run(tf.global_variables_initializer()) return model -def get_predicted_sentence(input_sentence, vocab, rev_vocab, model, sess): +def get_predicted_sentence(input_sentence, vocab, rev_vocab, model, sess, use_beam_search=False): input_token_ids = data_utils.sentence_to_token_ids(input_sentence, vocab) bucket_id = min([b for b in range(len(BUCKETS)) if BUCKETS[b][0] > len(input_token_ids)]) @@ -46,17 +47,20 @@ def get_predicted_sentence(input_sentence, vocab, rev_vocab, model, sess): feed_data = {bucket_id: [(input_token_ids, outputs)]} encoder_inputs, decoder_inputs, target_weights = model.get_batch(feed_data, bucket_id) - _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only=True) + if use_beam_search: + _, _, output_words = model.step_beam_search(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only=True) + outputs = output_words[1:] + output_sentence = ' '.join([rev_vocab[token_id] for token_id in outputs]) - outputs = [] - - for logit in output_logits: - selected_token_id = int(np.argmax(logit, axis=1)) - if selected_token_id == data_utils.EOS_ID: - break - else: - outputs.append(selected_token_id) - - output_sentence = ' '.join([rev_vocab[output] for output in outputs]) + else: + _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only=True) + outputs = [] + for logit in output_logits: + selected_token_id = int(np.argmax(logit, axis=1)) + if selected_token_id == data_utils.EOS_ID: + break + else: + outputs.append(selected_token_id) + output_sentence = ' '.join([rev_vocab[output] for output in outputs]) return output_sentence \ No newline at end of file From 7402ce99794c22c4514e8738a654667e0625aced Mon Sep 17 00:00:00 2001 From: RuiCheung Date: Fri, 12 May 2017 15:59:51 +0800 Subject: [PATCH 02/14] Convert to PEP8 format. --- tf_chatbot/lib/basic/advanced_seq2seq.py | 420 ----------------------- tf_chatbot/lib/chat.py | 11 +- tf_chatbot/lib/data_utils.py | 70 ++-- tf_chatbot/lib/predict.py | 14 +- tf_chatbot/lib/seq2seq_model.py | 314 +++++++++-------- tf_chatbot/lib/seq2seq_model_utils.py | 31 +- tf_chatbot/lib/train.py | 49 ++- 7 files changed, 303 insertions(+), 606 deletions(-) delete mode 100644 tf_chatbot/lib/basic/advanced_seq2seq.py diff --git a/tf_chatbot/lib/basic/advanced_seq2seq.py b/tf_chatbot/lib/basic/advanced_seq2seq.py deleted file mode 100644 index 9812072..0000000 --- a/tf_chatbot/lib/basic/advanced_seq2seq.py +++ /dev/null @@ -1,420 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -# We disable pylint because we need python3 compatibility. 
-# from six.moves import xrange # pylint: disable=redefined-builtin -# from six.moves import zip # pylint: disable=redefined-builtin - -from tensorflow.contrib.rnn.python.ops import core_rnn -from tensorflow.contrib.rnn.python.ops import core_rnn_cell -from tensorflow.contrib.rnn.python.ops import core_rnn_cell_impl -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import control_flow_ops -from tensorflow.python.ops import embedding_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import nn_ops -from tensorflow.python.ops import variable_scope -from tensorflow.python.util import nest -from tensorflow import multinomial, squeeze -import tensorflow as tf - -# TODO(ebrevdo): Remove once _linear is fully deprecated. -linear = core_rnn_cell_impl._linear # pylint: disable=protected-access - - -def _extract_sample_and_embed(embedding, - output_projection=None, - update_embedding=True): - def loop_function(prev, _): - if output_projection is not None: - prev = nn_ops.xw_plus_b(prev, output_projection[0], output_projection[1]) - # prev_symbol = math_ops.argmax(prev, 1) - prev_symbol = squeeze(multinomial(prev, 1), axis=1) - emb_prev = embedding_ops.embedding_lookup(embedding, prev_symbol) - if not update_embedding: - emb_prev = array_ops.stop_gradient(emb_prev) - return emb_prev - - return loop_function - - -def embedding_attention_sampled_seq2seq(encoder_inputs, - decoder_inputs, - cell, - num_encoder_symbols, - num_decoder_symbols, - embedding_size, - num_heads=1, - output_projection=None, - feed_previous=False, - dtype=None, - scope=None, - initial_state_attention=False, - batch_size=None): - """Embedding sequence-to-sequence model with attention. - - This model first embeds encoder_inputs by a newly created embedding (of shape - [num_encoder_symbols x input_size]). Then it runs an RNN to encode - embedded encoder_inputs into a state vector. It keeps the outputs of this - RNN at every step to use for attention later. Next, it embeds decoder_inputs - by another newly created embedding (of shape [num_decoder_symbols x - input_size]). Then it runs attention decoder, initialized with the last - encoder state, on embedded decoder_inputs and attending to encoder outputs. - - Warning: when output_projection is None, the size of the attention vectors - and variables will be made proportional to num_decoder_symbols, can be large. - - Args: - encoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. - decoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. - cell: core_rnn_cell.RNNCell defining the cell function and size. - num_encoder_symbols: Integer; number of symbols on the encoder side. - num_decoder_symbols: Integer; number of symbols on the decoder side. - embedding_size: Integer, the length of the embedding vector for each symbol. - num_heads: Number of attention heads that read from attention_states. - output_projection: None or a pair (W, B) of output projection weights and - biases; W has shape [output_size x num_decoder_symbols] and B has - shape [num_decoder_symbols]; if provided and feed_previous=True, each - fed previous output will first be multiplied by W and added B. - feed_previous: Boolean or scalar Boolean Tensor; if True, only the first - of decoder_inputs will be used (the "GO" symbol), and all other decoder - inputs will be taken from previous outputs (as in embedding_rnn_decoder). 
- If False, decoder_inputs are used as given (the standard decoder case). - dtype: The dtype of the initial RNN state (default: tf.float32). - scope: VariableScope for the created subgraph; defaults to - "embedding_attention_seq2seq". - initial_state_attention: If False (default), initial attentions are zero. - If True, initialize the attentions from the initial state and attention - states. - - Returns: - A tuple of the form (outputs, state), where: - outputs: A list of the same length as decoder_inputs of 2D Tensors with - shape [batch_size x num_decoder_symbols] containing the generated - outputs. - state: The state of each decoder cell at the final time-step. - It is a 2D Tensor of shape [batch_size x cell.state_size]. - """ - with variable_scope.variable_scope( - scope or "embedding_attention_seq2seq", dtype=dtype) as scope: - dtype = scope.dtype - # Encoder. - encoder_cell = core_rnn_cell.EmbeddingWrapper( - cell, - embedding_classes=num_encoder_symbols, - embedding_size=embedding_size) - encoder_outputs, encoder_state = core_rnn.static_rnn( - encoder_cell, encoder_inputs, dtype=dtype) - - # First calculate a concatenation of encoder outputs to put attention on. - top_states = [ - array_ops.reshape(e, [-1, 1, cell.output_size]) for e in encoder_outputs - ] - attention_states = array_ops.concat(top_states, 1) - - # Decoder. - output_size = None - if output_projection is None: - cell = core_rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols) - output_size = num_decoder_symbols - - if isinstance(feed_previous, bool): - return embedding_attention_decoder( - decoder_inputs, - encoder_state, - attention_states, - cell, - num_decoder_symbols, - embedding_size, - num_heads=num_heads, - output_size=output_size, - output_projection=output_projection, - feed_previous=feed_previous, - initial_state_attention=initial_state_attention) - - else: - # If feed_previous is a Tensor, we construct 2 graphs and use cond. - def decoder(feed_previous_bool): - reuse = None if feed_previous_bool else True - with variable_scope.variable_scope( - variable_scope.get_variable_scope(), reuse=reuse) as scope: - outputs, state = embedding_attention_decoder( - decoder_inputs, - encoder_state, - attention_states, - cell, - num_decoder_symbols, - embedding_size, - num_heads=num_heads, - output_size=output_size, - output_projection=output_projection, - feed_previous=feed_previous_bool, - update_embedding_for_previous=False, - initial_state_attention=initial_state_attention) - state_list = [state] - if nest.is_sequence(state): - state_list = nest.flatten(state) - return outputs + state_list - - outputs_and_state = control_flow_ops.cond(feed_previous, - lambda: decoder(True), - lambda: decoder(False)) - outputs_len = len(decoder_inputs) # Outputs length same as decoder inputs. - state_list = outputs_and_state[outputs_len:] - state = state_list[0] - if nest.is_sequence(encoder_state): - state = nest.pack_sequence_as( - structure=encoder_state, flat_sequence=state_list) - return outputs_and_state[:outputs_len], state - - -def embedding_attention_decoder(decoder_inputs, - initial_state, - attention_states, - cell, - num_symbols, - embedding_size, - num_heads=1, - output_size=None, - output_projection=None, - feed_previous=False, - update_embedding_for_previous=True, - dtype=None, - scope=None, - initial_state_attention=False): - """RNN decoder with embedding and attention and a pure-decoding option. - - Args: - decoder_inputs: A list of 1D batch-sized int32 Tensors (decoder inputs). 
- initial_state: 2D Tensor [batch_size x cell.state_size]. - attention_states: 3D Tensor [batch_size x attn_length x attn_size]. - cell: core_rnn_cell.RNNCell defining the cell function. - num_symbols: Integer, how many symbols come into the embedding. - embedding_size: Integer, the length of the embedding vector for each symbol. - num_heads: Number of attention heads that read from attention_states. - output_size: Size of the output vectors; if None, use output_size. - output_projection: None or a pair (W, B) of output projection weights and - biases; W has shape [output_size x num_symbols] and B has shape - [num_symbols]; if provided and feed_previous=True, each fed previous - output will first be multiplied by W and added B. - feed_previous: Boolean; if True, only the first of decoder_inputs will be - used (the "GO" symbol), and all other decoder inputs will be generated by: - next = embedding_lookup(embedding, argmax(previous_output)), - In effect, this implements a greedy decoder. It can also be used - during training to emulate http://arxiv.org/abs/1506.03099. - If False, decoder_inputs are used as given (the standard decoder case). - update_embedding_for_previous: Boolean; if False and feed_previous=True, - only the embedding for the first symbol of decoder_inputs (the "GO" - symbol) will be updated by back propagation. Embeddings for the symbols - generated from the decoder itself remain unchanged. This parameter has - no effect if feed_previous=False. - dtype: The dtype to use for the RNN initial states (default: tf.float32). - scope: VariableScope for the created subgraph; defaults to - "embedding_attention_decoder". - initial_state_attention: If False (default), initial attentions are zero. - If True, initialize the attentions from the initial state and attention - states -- useful when we wish to resume decoding from a previously - stored decoder state and attention states. - - Returns: - A tuple of the form (outputs, state), where: - outputs: A list of the same length as decoder_inputs of 2D Tensors with - shape [batch_size x output_size] containing the generated outputs. - state: The state of each decoder cell at the final time-step. - It is a 2D Tensor of shape [batch_size x cell.state_size]. - - Raises: - ValueError: When output_projection has the wrong shape. - """ - if output_size is None: - output_size = cell.output_size - if output_projection is not None: - proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype) - proj_biases.get_shape().assert_is_compatible_with([num_symbols]) - - with variable_scope.variable_scope( - scope or "embedding_attention_decoder", dtype=dtype) as scope: - - embedding = variable_scope.get_variable("embedding", - [num_symbols, embedding_size]) - loop_function = _extract_sample_and_embed( - embedding, output_projection, - update_embedding_for_previous) if feed_previous else None - emb_inp = [ - embedding_ops.embedding_lookup(embedding, i) for i in decoder_inputs - ] - return attention_decoder( - emb_inp, - initial_state, - attention_states, - cell, - output_size=output_size, - num_heads=num_heads, - loop_function=loop_function, - initial_state_attention=initial_state_attention) - - -def attention_decoder(decoder_inputs, - initial_state, - attention_states, - cell, - output_size=None, - num_heads=1, - loop_function=None, - dtype=None, - scope=None, - initial_state_attention=False): - """RNN decoder with attention for the sequence-to-sequence model. 
- - In this context "attention" means that, during decoding, the RNN can look up - information in the additional tensor attention_states, and it does this by - focusing on a few entries from the tensor. This model has proven to yield - especially good results in a number of sequence-to-sequence tasks. This - implementation is based on http://arxiv.org/abs/1412.7449 (see below for - details). It is recommended for complex sequence-to-sequence tasks. - - Args: - decoder_inputs: A list of 2D Tensors [batch_size x input_size]. - initial_state: 2D Tensor [batch_size x cell.state_size]. - attention_states: 3D Tensor [batch_size x attn_length x attn_size]. - cell: core_rnn_cell.RNNCell defining the cell function and size. - output_size: Size of the output vectors; if None, we use cell.output_size. - num_heads: Number of attention heads that read from attention_states. - loop_function: If not None, this function will be applied to i-th output - in order to generate i+1-th input, and decoder_inputs will be ignored, - except for the first element ("GO" symbol). This can be used for decoding, - but also for training to emulate http://arxiv.org/abs/1506.03099. - Signature -- loop_function(prev, i) = next - * prev is a 2D Tensor of shape [batch_size x output_size], - * i is an integer, the step number (when advanced control is needed), - * next is a 2D Tensor of shape [batch_size x input_size]. - dtype: The dtype to use for the RNN initial state (default: tf.float32). - scope: VariableScope for the created subgraph; default: "attention_decoder". - initial_state_attention: If False (default), initial attentions are zero. - If True, initialize the attentions from the initial state and attention - states -- useful when we wish to resume decoding from a previously - stored decoder state and attention states. - - Returns: - A tuple of the form (outputs, state), where: - outputs: A list of the same length as decoder_inputs of 2D Tensors of - shape [batch_size x output_size]. These represent the generated outputs. - Output i is computed from input i (which is either the i-th element - of decoder_inputs or loop_function(output {i-1}, i)) as follows. - First, we run the cell on a combination of the input and previous - attention masks: - cell_output, new_state = cell(linear(input, prev_attn), prev_state). - Then, we calculate new attention masks: - new_attn = softmax(V^T * tanh(W * attention_states + U * new_state)) - and then we calculate the output: - output = linear(cell_output, new_attn). - state: The state of each decoder cell the final time-step. - It is a 2D Tensor of shape [batch_size x cell.state_size]. - - Raises: - ValueError: when num_heads is not positive, there are no inputs, shapes - of attention_states are not set, or input size cannot be inferred - from the input. - """ - if not decoder_inputs: - raise ValueError("Must provide at least 1 input to attention decoder.") - if num_heads < 1: - raise ValueError("With less than 1 heads, use a non-attention decoder.") - if attention_states.get_shape()[2].value is None: - raise ValueError("Shape[2] of attention_states must be known: %s" % - attention_states.get_shape()) - if output_size is None: - output_size = cell.output_size - - with variable_scope.variable_scope( - scope or "attention_decoder", dtype=dtype) as scope: - dtype = scope.dtype - - batch_size = array_ops.shape(decoder_inputs[0])[0] # Needed for reshaping. 
- attn_length = attention_states.get_shape()[1].value - if attn_length is None: - attn_length = array_ops.shape(attention_states)[1] - attn_size = attention_states.get_shape()[2].value - - # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before. - hidden = array_ops.reshape(attention_states, - [-1, attn_length, 1, attn_size]) - hidden_features = [] - v = [] - attention_vec_size = attn_size # Size of query vectors for attention. - for a in xrange(num_heads): - k = variable_scope.get_variable("AttnW_%d" % a, - [1, 1, attn_size, attention_vec_size]) - hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")) - v.append( - variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size])) - - state = initial_state - - def attention(query): - """Put attention masks on hidden using hidden_features and query.""" - ds = [] # Results of attention reads will be stored here. - if nest.is_sequence(query): # If the query is a tuple, flatten it. - query_list = nest.flatten(query) - for q in query_list: # Check that ndims == 2 if specified. - ndims = q.get_shape().ndims - if ndims: - assert ndims == 2 - query = array_ops.concat(query_list, 1) - for a in xrange(num_heads): - with variable_scope.variable_scope("Attention_%d" % a): - y = linear(query, attention_vec_size, True) - y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) - # Attention mask is a softmax of v^T * tanh(...). - s = math_ops.reduce_sum(v[a] * math_ops.tanh(hidden_features[a] + y), - [2, 3]) - a = nn_ops.softmax(s) - # Now calculate the attention-weighted vector d. - d = math_ops.reduce_sum( - array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2]) - ds.append(array_ops.reshape(d, [-1, attn_size])) - return ds - - outputs = [] - prev = None - batch_attn_size = array_ops.stack([batch_size, attn_size]) - attns = [ - array_ops.zeros( - batch_attn_size, dtype=dtype) for _ in xrange(num_heads) - ] - for a in attns: # Ensure the second shape of attention vectors is set. - a.set_shape([None, attn_size]) - if initial_state_attention: - attns = attention(initial_state) - for i, inp in enumerate(decoder_inputs): - if i > 0: - variable_scope.get_variable_scope().reuse_variables() - # If loop_function is set, we use it instead of decoder_inputs. - if loop_function is not None and prev is not None: - with variable_scope.variable_scope("loop_function", reuse=True): - inp = loop_function(prev, i) - # Merge input and previous attentions into one vector of the right size. - input_size = inp.get_shape().with_rank(2)[1] - if input_size.value is None: - raise ValueError("Could not infer input size from input: %s" % inp.name) - x = linear([inp] + attns, input_size, True) - # Run the RNN. - cell_output, state = cell(x, state) - # Run the attention mechanism. 
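For illustration, the attention masks computed above reduce, for a single head, to score_i = v^T tanh(W h_i + U q), a softmax over encoder steps, and a weighted sum of the encoder outputs. Below is a minimal NumPy sketch of that computation; the names W, U, v and the random shapes are placeholder assumptions, not the patch's variables.

import numpy as np

def additive_attention(query, attention_states, W, U, v):
    # query: [attn_size], attention_states: [attn_length, attn_size]
    scores = np.tanh(attention_states @ W.T + query @ U.T) @ v   # [attn_length]
    weights = np.exp(scores - scores.max())
    weights /= weights.sum()                      # softmax over encoder positions
    context = weights @ attention_states          # attention-weighted vector d
    return context, weights

rng = np.random.default_rng(0)
attn_length, attn_size = 5, 8
h = rng.normal(size=(attn_length, attn_size))     # encoder outputs (attention_states)
q = rng.normal(size=attn_size)                    # decoder cell state (query)
W = rng.normal(size=(attn_size, attn_size))
U = rng.normal(size=(attn_size, attn_size))
v = rng.normal(size=attn_size)
context, weights = additive_attention(q, h, W, U, v)
print(weights.round(3), context.shape)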
- if i == 0 and initial_state_attention: - with variable_scope.variable_scope( - variable_scope.get_variable_scope(), reuse=True): - attns = attention(state) - else: - attns = attention(state) - - with variable_scope.variable_scope("AttnOutputProjection"): - output = linear([cell_output] + attns, output_size, True) - if loop_function is not None: - prev = output - outputs.append(output) - - return outputs, state diff --git a/tf_chatbot/lib/chat.py b/tf_chatbot/lib/chat.py index 094737e..18a5c65 100644 --- a/tf_chatbot/lib/chat.py +++ b/tf_chatbot/lib/chat.py @@ -7,13 +7,17 @@ from tf_chatbot.lib import data_utils from tf_chatbot.lib.seq2seq_model_utils import create_model, get_predicted_sentence + def chat(): with tf.Session() as sess: model = create_model(sess, forward_only=True) model.batch_size = 1 - vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.in" % FLAGS.vocab_size) + vocab_path = os.path.join( + FLAGS.data_dir, + "vocab%d.in" % + FLAGS.vocab_size) vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path) sys.stdout.write("> ") @@ -21,8 +25,9 @@ def chat(): sentence = sys.stdin.readline() while sentence: - predicted_sentence = get_predicted_sentence(sentence, vocab, rev_vocab, model, sess) + predicted_sentence = get_predicted_sentence( + sentence, vocab, rev_vocab, model, sess) print(predicted_sentence) print("> ") sys.stdout.flush() - sentence = sys.stdin.readline() \ No newline at end of file + sentence = sys.stdin.readline() diff --git a/tf_chatbot/lib/data_utils.py b/tf_chatbot/lib/data_utils.py index cb976ee..e823e33 100644 --- a/tf_chatbot/lib/data_utils.py +++ b/tf_chatbot/lib/data_utils.py @@ -28,22 +28,28 @@ _ENCODING = "utf8" + def get_dialog_train_set_path(path): return os.path.join(path, 'train_data') + def get_dialog_dev_set_path(path): return os.path.join(path, 'dev_data') + def basic_tokenizer(sentence): words = [] for space_separated_fragment in sentence.strip().split(): words.extend(re.split(_WORD_SPLIT, space_separated_fragment)) return [w.lower() for w in words if w] + def create_vocabulary_bak(vocabulary_path, data_path, max_vocabulary_size, - tokenizer=None, normalize_digits=True): + tokenizer=None, normalize_digits=True): if not gfile.Exists(vocabulary_path): - print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path)) + print( + "Creating vocabulary %s from data %s" % + (vocabulary_path, data_path)) vocab = {} with gfile.GFile(data_path, mode='r') as f: counter = 0 @@ -51,28 +57,33 @@ def create_vocabulary_bak(vocabulary_path, data_path, max_vocabulary_size, counter += 1 if counter % 100000 == 0: print(" processing line %d" % counter) - tokens = tokenizer(line) if tokenizer else basic_tokenizer(line) + tokens = tokenizer( + line) if tokenizer else basic_tokenizer(line) for w in tokens: word = re.sub(_DIGIT_RE, "0", w) if normalize_digits else w if word in vocab: vocab[word] += 1 else: vocab[word] = 1 - vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True) + vocab_list = _START_VOCAB + \ + sorted(vocab, key=vocab.get, reverse=True) if len(vocab_list) > max_vocabulary_size: vocab_list = vocab_list[:max_vocabulary_size] with gfile.GFile(vocabulary_path, mode='w') as vocab_file: for w in vocab_list: vocab_file.write(w + '\n') + def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size, - tokenizer=None, normalize_digits=True): + tokenizer=None, normalize_digits=True): if not gfile.Exists(vocabulary_path): - print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path)) + print( + "Creating 
vocabulary %s from data %s" % + (vocabulary_path, data_path)) vocab = {} data = json.load(open(data_path), encoding=_ENCODING) counter = 0 - for ((q,qe),(a,ae)) in data: + for ((q, qe), (a, ae)) in data: counter += 1 if counter % 50000 == 0: print(" Create_vocabulary: processing line %d" % counter) @@ -110,11 +121,13 @@ def initialize_vocabulary(vocabulary_path): rev_vocab.extend(f.readlines()) rev_vocab = [line.strip() for line in rev_vocab] - vocab = dict([(x,y) for (y,x) in enumerate(rev_vocab)]) # {'word':index} + vocab = dict([(x, y) + for (y, x) in enumerate(rev_vocab)]) # {'word':index} return vocab, rev_vocab else: raise ValueError("Vocabulary file %s not found" % vocabulary_path) + def sentence_to_token_ids(sentence, vocabulary, tokenizer=None, normalize_digits=True): if tokenizer: @@ -123,10 +136,17 @@ def sentence_to_token_ids(sentence, vocabulary, words = basic_tokenizer(sentence) if not normalize_digits: return [vocabulary.get(w.encode('utf8'), UNK_ID) for w in words] - return [vocabulary.get(re.sub(_DIGIT_RE, "0", w.encode('utf8')), UNK_ID) for w in words] + return [ + vocabulary.get( + re.sub( + _DIGIT_RE, + "0", + w.encode('utf8')), + UNK_ID) for w in words] + def data_to_token_ids_bak(data_path, target_path, vocabulary_path, - tokenizer=None, normalize_digits=True): + tokenizer=None, normalize_digits=True): if not gfile.Exists(target_path): print("Tokenizing data in %s" % data_path) vocab, _ = initialize_vocabulary(vocabulary_path) @@ -139,31 +159,38 @@ def data_to_token_ids_bak(data_path, target_path, vocabulary_path, print(" tokenizing line %d" % counter) token_ids = sentence_to_token_ids(line, vocab, tokenizer, normalize_digits) - tokens_file.write(" ".join([str(tok) for tok in token_ids]) + '\n') + tokens_file.write( + " ".join([str(tok) for tok in token_ids]) + '\n') + def data_to_token_ids(data_path, target_path, vocabulary_path, - tokenizer=None, normalize_digits=True): + tokenizer=None, normalize_digits=True): if not gfile.Exists(target_path): print("Tokenizing data in %s" % data_path) vocab, _ = initialize_vocabulary(vocabulary_path) with gfile.GFile(target_path, mode='w') as tokens_file: data = json.load(open(data_path), encoding=_ENCODING) counter = 0 - for ((q,qe),(a,ae)) in data: + for ((q, qe), (a, ae)) in data: counter += 1 if counter % 50000 == 0: print(" Data_to_token_ids: tokenizing line %d" % counter) - token_ids_q = sentence_to_token_ids(q, vocab, tokenizer, normalize_digits) - tokens_file.write(" ".join([str(tok) for tok in token_ids_q]) + '\n') - token_ids_a = sentence_to_token_ids(a, vocab, tokenizer, normalize_digits) - tokens_file.write(" ".join([str(tok) for tok in token_ids_a]) + '\n') + token_ids_q = sentence_to_token_ids( + q, vocab, tokenizer, normalize_digits) + tokens_file.write(" ".join([str(tok) + for tok in token_ids_q]) + '\n') + token_ids_a = sentence_to_token_ids( + a, vocab, tokenizer, normalize_digits) + tokens_file.write(" ".join([str(tok) + for tok in token_ids_a]) + '\n') + def prepare_dialog_data(data_dir, vocabulary_size): train_path = get_dialog_train_set_path(data_dir) dev_path = get_dialog_dev_set_path(data_dir) vocab_path = os.path.join(data_dir, "vocab%d.in" % vocabulary_size) - create_vocabulary(vocab_path, train_path+".json", vocabulary_size) + create_vocabulary(vocab_path, train_path + ".json", vocabulary_size) train_ids_path = train_path + (".ids%d.in" % vocabulary_size) data_to_token_ids(train_path + ".json", train_ids_path, vocab_path) @@ -173,6 +200,7 @@ def prepare_dialog_data(data_dir, vocabulary_size): return 
(train_ids_path, dev_ids_path, vocab_path) + def read_data(tokenized_dialog_path, max_size=None): data_set = [[] for _ in BUCKETS] @@ -184,16 +212,16 @@ def read_data(tokenized_dialog_path, max_size=None): counter += 1 if counter % 100000 == 0: print(" reading data line %d" % counter) - #sys.stdout.flush() + # sys.stdout.flush() source_ids = [int(x) for x in source.split()] target_ids = [int(x) for x in target.split()] target_ids.append(EOS_ID) for bucket_id, (source_size, target_size) in enumerate(BUCKETS): - if len(source_ids) < source_size and len(target_ids) < target_size: + if len(source_ids) < source_size and len( + target_ids) < target_size: data_set[bucket_id].append([source_ids, target_ids]) break source, target = fh.readline(), fh.readline() return data_set - diff --git a/tf_chatbot/lib/predict.py b/tf_chatbot/lib/predict.py index 00b23f7..6f61c99 100644 --- a/tf_chatbot/lib/predict.py +++ b/tf_chatbot/lib/predict.py @@ -7,13 +7,15 @@ from tf_chatbot.lib.seq2seq_model_utils import create_model, get_predicted_sentence import json + def predict(): def _get_test_dataset(): data = json.load(open(TEST_DATASET_PATH)) test_sentences = [q for ((q, qe), _) in data] return test_sentences - results_filename = '_'.join(['results', str(FLAGS.num_layers), str(FLAGS.size), str(FLAGS.vocab_size)]) + results_filename = '_'.join( + ['results', str(FLAGS.num_layers), str(FLAGS.size), str(FLAGS.vocab_size)]) results_path = os.path.join(FLAGS.results_dir, results_filename) with tf.Session() as sess, open(results_path, 'w') as results_fh: @@ -21,14 +23,18 @@ def _get_test_dataset(): model = create_model(sess, forward_only=True) model.batch_size = 1 - vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.in" % FLAGS.vocab_size) + vocab_path = os.path.join( + FLAGS.data_dir, + "vocab%d.in" % + FLAGS.vocab_size) vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path) test_dataset = _get_test_dataset() for sentence in test_dataset: - predicted_sentence = get_predicted_sentence(sentence, vocab, rev_vocab, model, sess, use_beam_search=True) + predicted_sentence = get_predicted_sentence( + sentence, vocab, rev_vocab, model, sess, use_beam_search=True) print(sentence, '->') print(predicted_sentence) - results_fh.write(predicted_sentence + '\n') \ No newline at end of file + results_fh.write(predicted_sentence + '\n') diff --git a/tf_chatbot/lib/seq2seq_model.py b/tf_chatbot/lib/seq2seq_model.py index 61ecde3..dce3fbc 100644 --- a/tf_chatbot/lib/seq2seq_model.py +++ b/tf_chatbot/lib/seq2seq_model.py @@ -5,29 +5,74 @@ import random import numpy as np -# from six.moves import xrange # pylint: disable=redefined-builtin +from six.moves import xrange # pylint: disable=redefined-builtin import tensorflow as tf import tf_chatbot.lib.data_utils as data_utils -from tensorflow.contrib.legacy_seq2seq import model_with_buckets, sequence_loss, embedding_attention_decoder -from tf_chatbot.lib.basic.advanced_seq2seq import embedding_attention_sampled_seq2seq +from tensorflow.contrib.legacy_seq2seq import sequence_loss, attention_decoder from tensorflow.contrib.rnn import GRUCell, BasicLSTMCell, MultiRNNCell, EmbeddingWrapper, static_rnn, OutputProjectionWrapper -class Seq2SeqModel(object): - """Sequence-to-sequence model with attention and for multiple buckets. - This class implements a multi-layer recurrent neural network as encoder, - and an attention-based decoder. 
This is the same as the model described in - this paper: http://arxiv.org/abs/1412.7449 - please look there for details, - or into the seq2seq library for complete model implementation. - This class also allows to use GRU cells in addition to LSTM cells, and - sampled softmax to handle large output vocabulary size. A single-layer - version of this model, but with bi-directional encoder, was presented in - http://arxiv.org/abs/1409.0473 - and sampled softmax is described in Section 3 of the following paper. - http://arxiv.org/abs/1412.2007 - """ +def _extract_sample_and_embed(embedding, + output_projection=None, + update_embedding=True): + + def loop_function(prev, _): + if output_projection is not None: + prev = tf.nn.xw_plus_b( + prev, output_projection[0], output_projection[1]) + prev_symbol = tf.squeeze(tf.multinomial(prev, 1), axis=1) + emb_prev = tf.nn.embedding_lookup(embedding, prev_symbol) + if not update_embedding: + emb_prev = tf.stop_gradient(emb_prev) + return emb_prev + + return loop_function + + +def embedding_attention_decoder(decoder_inputs, + initial_state, + attention_states, + cell, + num_symbols, + embedding_size, + num_heads=1, + output_size=None, + output_projection=None, + feed_previous=False, + update_embedding_for_previous=True, + dtype=None, + scope=None, + initial_state_attention=False): + if output_size is None: + output_size = cell.output_size + if output_projection is not None: + proj_biases = tf.convert_to_tensor(output_projection[1], dtype=dtype) + proj_biases.get_shape().assert_is_compatible_with([num_symbols]) + + with tf.variable_scope("embedding_attention_decoder", dtype=dtype): + embedding = tf.get_variable("embedding", [num_symbols, embedding_size]) + + loop_function = _extract_sample_and_embed( + embedding, output_projection, + update_embedding_for_previous) if feed_previous else None + emb_inp = [ + tf.nn.embedding_lookup( + embedding, + i) for i in decoder_inputs] + + return attention_decoder( + emb_inp, + initial_state, + attention_states, + cell, + output_size=output_size, + num_heads=num_heads, + loop_function=loop_function, + initial_state_attention=initial_state_attention) + +class Seq2SeqModel(object): def __init__(self, source_vocab_size, target_vocab_size, @@ -43,28 +88,7 @@ def __init__(self, forward_only=False, beam_search_size=1, dtype=tf.float32): - """Create the model. - Args: - source_vocab_size: size of the source vocabulary. - target_vocab_size: size of the target vocabulary. - buckets: a list of pairs (I, O), where I specifies maximum input length - that will be processed in that bucket, and O specifies maximum output - length. Training instances that have inputs longer than I or outputs - longer than O will be pushed to the next bucket and padded accordingly. - We assume that the list is sorted, e.g., [(2, 4), (8, 16)]. - size: number of units in each layer of the model. - num_layers: number of layers in the model. - max_gradient_norm: gradients will be clipped to maximally this norm. - batch_size: the size of the batches used during training; - the model construction is independent of batch_size, so it can be - changed after initialization if this is convenient, e.g., for decoding. - learning_rate: learning rate to start with. - learning_rate_decay_factor: decay learning rate by this much when needed. - use_lstm: if true, we use LSTM cells instead of GRU cells. - num_samples: number of samples for sampled softmax. - forward_only: if set, we do not construct the backward pass in the model. 
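To make the bucket mechanics concrete, here is a small sketch of how a (source, target) pair is routed to the smallest fitting bucket and then padded, in the spirit of read_data and get_batch elsewhere in this patch series; the BUCKETS values and token ids below are assumed examples, not the project's actual configuration.

PAD_ID, GO_ID, EOS_ID = 0, 1, 2                     # assumed special-symbol ids
BUCKETS = [(5, 10), (10, 15), (20, 25), (40, 50)]   # assumed (encoder len, decoder len) pairs

def assign_bucket(source_ids, target_ids):
    # Pick the first bucket both sequences fit into (target already ends with EOS).
    for bucket_id, (src_size, tgt_size) in enumerate(BUCKETS):
        if len(source_ids) < src_size and len(target_ids) < tgt_size:
            return bucket_id
    return None   # longer than every bucket; such pairs are simply skipped

def pad_pair(source_ids, target_ids, bucket_id):
    # Encoder input is padded then reversed; decoder input gets a GO prefix and PAD suffix.
    src_size, tgt_size = BUCKETS[bucket_id]
    encoder = list(reversed(source_ids + [PAD_ID] * (src_size - len(source_ids))))
    decoder = [GO_ID] + target_ids + [PAD_ID] * (tgt_size - len(target_ids) - 1)
    return encoder, decoder

source, target = [4, 8, 15], [16, 23, EOS_ID]
bucket = assign_bucket(source, target)
print(bucket, pad_pair(source, target, bucket))
# 0 ([0, 0, 15, 8, 4], [1, 16, 23, 2, 0, 0, 0, 0, 0, 0])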
- dtype: the data type to use to store internal variables. - """ + self.source_vocab_size = source_vocab_size self.target_vocab_size = target_vocab_size self.buckets = buckets @@ -79,11 +103,16 @@ def __init__(self, # If we use sampled softmax, we need an output projection. output_projection = None softmax_loss_function = None - # Sampled softmax only makes sense if we sample less than vocabulary size. + # Sampled softmax only makes sense if we sample less than vocabulary + # size. if num_samples > 0 and num_samples < self.target_vocab_size: - w_t = tf.get_variable("proj_w", [self.target_vocab_size, size], dtype=dtype) + w_t = tf.get_variable( + "proj_w", [ + self.target_vocab_size, size], dtype=dtype) w = tf.transpose(w_t) - b = tf.get_variable("proj_b", [self.target_vocab_size], dtype=dtype) + b = tf.get_variable( + "proj_b", [ + self.target_vocab_size], dtype=dtype) output_projection = (w, b) def sampled_loss(labels, logits): @@ -124,18 +153,19 @@ def single_cell(): # The seq2seq function: we use embedding for the input and attention. def seq2seq_f(encoder_inputs, decoder_inputs, do_decode, bucket_id): - def embedding_attention_sampled_seq2seq(encoder_inputs, - decoder_inputs, - cell, - num_encoder_symbols, - num_decoder_symbols, - embedding_size, - bucket_index, - num_heads=1, - output_projection=None, - feed_previous=False, - initial_state_attention=False, - dtype=tf.float32): + def embedding_attention_sampled_seq2seq( + encoder_inputs, + decoder_inputs, + cell, + num_encoder_symbols, + num_decoder_symbols, + embedding_size, + bucket_index, + num_heads=1, + output_projection=None, + feed_previous=False, + initial_state_attention=False, + dtype=tf.float32): with tf.variable_scope("embedding_attention_sampled_seq2seq"): encoder_cell = EmbeddingWrapper( cell, @@ -145,9 +175,8 @@ def embedding_attention_sampled_seq2seq(encoder_inputs, encoder_outputs, encoder_state = static_rnn( encoder_cell, encoder_inputs, dtype=dtype) - top_states = [ - tf.reshape(e, [-1,1, cell.output_size]) for e in encoder_outputs - ] + top_states = [tf.reshape( + e, [-1, 1, cell.output_size]) for e in encoder_outputs] attention_states = tf.concat(top_states, 1) self.model_encoder_states[bucket_index] = encoder_state @@ -155,14 +184,17 @@ def embedding_attention_sampled_seq2seq(encoder_inputs, output_size = None if output_projection is None: - cell = OutputProjectionWrapper(cell, num_decoder_symbols) + cell = OutputProjectionWrapper( + cell, num_decoder_symbols) output_size = num_decoder_symbols if isinstance(feed_previous, bool): - return embedding_attention_decoder( + return embedding_attention_decoder( decoder_inputs, - self.model_encoder_states[bucket_index], #encoder_state, - self.model_attention_states[bucket_index], #attention_states, + # encoder_state, + self.model_encoder_states[bucket_index], + # attention_states, + self.model_attention_states[bucket_index], cell, num_decoder_symbols, embedding_size, @@ -191,13 +223,22 @@ def embedding_attention_sampled_seq2seq(encoder_inputs, self.decoder_inputs = [] self.target_weights = [] for i in range(buckets[-1][0]): # Last bucket is the biggest one. 
- self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None], - name="encoder{0}".format(i))) + self.encoder_inputs.append( + tf.placeholder( + tf.int32, + shape=[None], + name="encoder{0}".format(i))) for i in range(buckets[-1][1] + 1): - self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None], - name="decoder{0}".format(i))) - self.target_weights.append(tf.placeholder(dtype, shape=[None], - name="weight{0}".format(i))) + self.decoder_inputs.append( + tf.placeholder( + tf.int32, + shape=[None], + name="decoder{0}".format(i))) + self.target_weights.append( + tf.placeholder( + dtype, + shape=[None], + name="weight{0}".format(i))) # Our targets are decoder inputs shifted by one. targets = [self.decoder_inputs[i + 1] @@ -211,14 +252,14 @@ def embedding_attention_sampled_seq2seq(encoder_inputs, with tf.variable_scope(tf.get_variable_scope(), reuse=True if bucket_idx > 0 else None): if forward_only: bucket_outputs, bucket_outputs_state = seq2seq_f(self.encoder_inputs[:bucket[0]], - self.decoder_inputs[:bucket[1]], - True, - bucket_idx) + self.decoder_inputs[:bucket[1]], + True, + bucket_idx) else: bucket_outputs, bucket_outputs_state = seq2seq_f(self.encoder_inputs[:bucket[0]], - self.decoder_inputs[:bucket[1]], - False, - bucket_idx) + self.decoder_inputs[:bucket[1]], + False, + bucket_idx) self.outputs.append(bucket_outputs) self.decoder_out_state.append(bucket_outputs_state) self.losses.append( @@ -237,7 +278,8 @@ def embedding_attention_sampled_seq2seq(encoder_inputs, for output in self.outputs[b]] #best_outputs = [tf.argmax(x,1) for x in self.outputs[b]] #best_outputs = tf.concat(axis=1, values=[tf.reshape(x, [self.batch_size, 1]) for x in best_outputs]) - _topk_log_probs, _topk_ids = tf.nn.top_k(tf.nn.softmax(self.outputs[b][-1]), beam_search_size) + _topk_log_probs, _topk_ids = tf.nn.top_k( + tf.nn.softmax(self.outputs[b][-1]), beam_search_size) self.topk_probs.append(_topk_log_probs) self.topk_ids.append(_topk_ids) @@ -249,26 +291,39 @@ def embedding_attention_sampled_seq2seq(encoder_inputs, opt = tf.train.GradientDescentOptimizer(self.learning_rate) for b in range(len(buckets)): gradients = tf.gradients(self.losses[b], params) - clipped_gradients, norm = tf.clip_by_global_norm(gradients, - max_gradient_norm) + clipped_gradients, norm = tf.clip_by_global_norm( + gradients, max_gradient_norm) self.gradient_norms.append(norm) self.updates.append(opt.apply_gradients( zip(clipped_gradients, params), global_step=self.global_step)) self.saver = tf.train.Saver(tf.global_variables()) - def step_beam_search(self, session, encoder_inputs, decoder_inputs, target_weights, - bucket_id, forward_only, use_beam_search=True): + def step( + self, + session, + encoder_inputs, + decoder_inputs, + target_weights, + bucket_id, + forward_only, + use_beam_search=False): encoder_size, decoder_size = self.buckets[bucket_id] if len(encoder_inputs) != encoder_size: - raise ValueError("Encoder length must be equal to the one in bucket," - " %d != %d." % (len(encoder_inputs), encoder_size)) + raise ValueError( + "Encoder length must be equal to the one in bucket," + " %d != %d." % + (len(encoder_inputs), encoder_size)) if len(decoder_inputs) != decoder_size: - raise ValueError("Decoder length must be equal to the one in bucket," - " %d != %d." % (len(decoder_inputs), decoder_size)) + raise ValueError( + "Decoder length must be equal to the one in bucket," + " %d != %d." 
% + (len(decoder_inputs), decoder_size)) if len(target_weights) != decoder_size: - raise ValueError("Weights length must be equal to the one in bucket," - " %d != %d." % (len(target_weights), decoder_size)) + raise ValueError( + "Weights length must be equal to the one in bucket," + " %d != %d." % + (len(target_weights), decoder_size)) input_feed = {} for l in range(encoder_size): @@ -281,42 +336,65 @@ def step_beam_search(self, session, encoder_inputs, decoder_inputs, target_weigh input_feed[last_target] = np.zeros([self.batch_size], dtype=np.int32) if not forward_only: - raise NotImplementedError("Not Implemented!!!") + output_feed = [self.updates[bucket_id], # Update Op that does SGD. + self.gradient_norms[bucket_id], # Gradient norm. + self.losses[bucket_id]] # Loss for this batch. + outputs = session.run(output_feed, input_feed) + + # Gradient norm, loss, no outputs. + return outputs[1], outputs[2], None else: if use_beam_search: output_feed = [self.model_attention_states[bucket_id], self.model_encoder_states[bucket_id]] - outputs = session.run(output_feed, input_feed) # attention_state, encoder_state - beams = [(0.0, [data_utils.GO_ID], data_utils.GO_ID, outputs[1])] * 3 # score, result, last_token, encoder_state + # attention_state, encoder_state + outputs = session.run(output_feed, input_feed) + # score, result, last_token, encoder_state + beams = [(0.0, + [data_utils.GO_ID], + data_utils.GO_ID, + outputs[1])] * 3 result = [] step = 0 attention_state = outputs[0] - while step < decoder_size and len(result) < self.beam_search_size: + while step < decoder_size and len( + result) < self.beam_search_size: step += 1 _last_tokens = [beam_[2] for beam_ in beams] _encoder_state = [beam_[3] for beam_ in beams] - output_feed = [self.topk_ids[bucket_id], self.topk_probs[bucket_id], self.decoder_out_state[bucket_id]] + output_feed = [ + self.topk_ids[bucket_id], + self.topk_probs[bucket_id], + self.decoder_out_state[bucket_id]] input_feed = {} input_feed[self.model_attention_states[bucket_id].name] = attention_state - input_feed[self.model_encoder_states[bucket_id].name] = np.squeeze(np.array(_encoder_state)) + input_feed[self.model_encoder_states[bucket_id].name] = np.squeeze( + np.array(_encoder_state)) for l in range(step): _decoder_inputs = [beam_[1][l] for beam_ in beams] input_feed[self.decoder_inputs[l].name] = _decoder_inputs - _tok_ids, _tok_probs, _out_states = session.run(output_feed, input_feed) + _tok_ids, _tok_probs, _out_states = session.run( + output_feed, input_feed) new_beams = [] for beam_idx in range(self.beam_search_size): for _idx in range(self.beam_search_size): - new_beams.append((beams[beam_idx][0]+_tok_probs[beam_idx][_idx], beams[beam_idx][1]+[_tok_ids[beam_idx][_idx]], _tok_ids[beam_idx][_idx], _out_states[beam_idx])) + new_beams.append( + (beams[beam_idx][0] + _tok_probs[beam_idx][_idx], + beams[beam_idx][1] + [ + _tok_ids[beam_idx][_idx]], + _tok_ids[beam_idx][_idx], + _out_states[beam_idx])) new_beams.sort(key=lambda x: x[0], reverse=True) beams = [] for beam_ in new_beams: if beam_[2] == data_utils.EOS_ID: - result.append((beam_[0],beam_[1][:-1],beam_[2],beam_[3])) + result.append( + (beam_[0], beam_[1][:-1], beam_[2], beam_[3])) else: beams.append(beam_) if len(beams) == self.beam_search_size: @@ -327,50 +405,17 @@ def step_beam_search(self, session, encoder_inputs, decoder_inputs, target_weigh result.append(beam_) if len(result) == self.beam_search_size: break - return None, None, result[0][1] - - else: - raise NotImplementedError("Not Implemented!!!") - - - 
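The control flow of the beam-search branch above — beams kept as (score, tokens, last_token, state) tuples, each expanded with its top-k successors, and hypotheses moved to the result list once they emit EOS — can be summarized with a self-contained toy version. The scoring callable and token ids here are made-up stand-ins for illustration, not the model's outputs.

import math

GO_ID, EOS_ID = 1, 2   # placeholder ids mirroring the data_utils conventions

def beam_search(next_token_log_probs, beam_size, max_len):
    beams = [(0.0, [GO_ID])]            # (cumulative log-prob, token sequence)
    finished = []
    for _ in range(max_len):
        candidates = []
        for score, tokens in beams:
            for tok, logp in next_token_log_probs(tokens)[:beam_size]:
                candidates.append((score + logp, tokens + [tok]))
        candidates.sort(key=lambda c: c[0], reverse=True)
        beams = []
        for score, tokens in candidates:
            if tokens[-1] == EOS_ID:
                finished.append((score, tokens[:-1]))   # drop EOS, as the patch does
            else:
                beams.append((score, tokens))
            if len(beams) == beam_size:
                break
        if len(finished) >= beam_size or not beams:
            break
    finished.extend(beams)                              # fall back to unfinished beams
    finished.sort(key=lambda c: c[0], reverse=True)
    return finished[0][1]

def fake_log_probs(tokens):
    # Toy distribution: prefer token 3 until three symbols are emitted, then prefer EOS.
    if len(tokens) < 3:
        return [(3, math.log(0.7)), (4, math.log(0.2)), (EOS_ID, math.log(0.1))]
    return [(EOS_ID, math.log(0.8)), (3, math.log(0.15)), (4, math.log(0.05))]

print(beam_search(fake_log_probs, beam_size=3, max_len=5))   # e.g. [1, 3, 3]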
def step(self, session, encoder_inputs, decoder_inputs, target_weights, - bucket_id, forward_only): - encoder_size, decoder_size = self.buckets[bucket_id] - if len(encoder_inputs) != encoder_size: - raise ValueError("Encoder length must be equal to the one in bucket," - " %d != %d." % (len(encoder_inputs), encoder_size)) - if len(decoder_inputs) != decoder_size: - raise ValueError("Decoder length must be equal to the one in bucket," - " %d != %d." % (len(decoder_inputs), decoder_size)) - if len(target_weights) != decoder_size: - raise ValueError("Weights length must be equal to the one in bucket," - " %d != %d." % (len(target_weights), decoder_size)) - - input_feed = {} - for l in range(encoder_size): - input_feed[self.encoder_inputs[l].name] = encoder_inputs[l] - for l in range(decoder_size): - input_feed[self.decoder_inputs[l].name] = decoder_inputs[l] - input_feed[self.target_weights[l].name] = target_weights[l] - last_target = self.decoder_inputs[decoder_size].name - input_feed[last_target] = np.zeros([self.batch_size], dtype=np.int32) + outputs = result[0] + return None, None, outputs[1] - if not forward_only: - output_feed = [self.updates[bucket_id], # Update Op that does SGD. - self.gradient_norms[bucket_id], # Gradient norm. - self.losses[bucket_id]] # Loss for this batch. - else: - output_feed = [self.losses[bucket_id]] # Loss for this batch. - for l in range(decoder_size): # Output logits. - output_feed.append(self.outputs[bucket_id][l]) - - outputs = session.run(output_feed, input_feed) + else: + output_feed = [self.losses[bucket_id]] # Loss for this batch. + for l in range(decoder_size): # Output logits. + output_feed.append(self.outputs[bucket_id][l]) - if not forward_only: - return outputs[1], outputs[2], None # Gradient norm, loss, no outputs. - else: - return None, outputs[0], outputs[1:] # No gradient norm, loss, outputs. + outputs = session.run(output_feed, input_feed) + return None, outputs[0], outputs[1:] def get_batch(self, data, bucket_id): @@ -380,7 +425,8 @@ def get_batch(self, data, bucket_id): for _ in range(self.batch_size): encoder_input, decoder_input = random.choice(data[bucket_id]) - encoder_pad = [data_utils.PAD_ID] * (encoder_size - len(encoder_input)) + encoder_pad = [data_utils.PAD_ID] * \ + (encoder_size - len(encoder_input)) encoder_inputs.append(list(reversed(encoder_input + encoder_pad))) decoder_pad_size = decoder_size - len(decoder_input) - 1 diff --git a/tf_chatbot/lib/seq2seq_model_utils.py b/tf_chatbot/lib/seq2seq_model_utils.py index 79e8659..10f9bb8 100644 --- a/tf_chatbot/lib/seq2seq_model_utils.py +++ b/tf_chatbot/lib/seq2seq_model_utils.py @@ -12,6 +12,7 @@ _INDEX = ".index" + def create_model(session, forward_only): model = seq2seq_model.Seq2SeqModel( source_vocab_size=FLAGS.vocab_size, @@ -33,27 +34,41 @@ def create_model(session, forward_only): model.saver.restore(session, ckpt.model_checkpoint_path) else: if ckpt: - print("Unable to reach checkpoint file %s." % ckpt.model_checkpoint_path) + print( + "Unable to reach checkpoint file %s." 
% + ckpt.model_checkpoint_path) print("Create model with fresh parameters") session.run(tf.global_variables_initializer()) return model -def get_predicted_sentence(input_sentence, vocab, rev_vocab, model, sess, use_beam_search=False): + +def get_predicted_sentence( + input_sentence, + vocab, + rev_vocab, + model, + sess, + use_beam_search=False): input_token_ids = data_utils.sentence_to_token_ids(input_sentence, vocab) - bucket_id = min([b for b in range(len(BUCKETS)) if BUCKETS[b][0] > len(input_token_ids)]) + bucket_id = min([b for b in range(len(BUCKETS)) + if BUCKETS[b][0] > len(input_token_ids)]) outputs = [] feed_data = {bucket_id: [(input_token_ids, outputs)]} - encoder_inputs, decoder_inputs, target_weights = model.get_batch(feed_data, bucket_id) + encoder_inputs, decoder_inputs, target_weights = model.get_batch( + feed_data, bucket_id) if use_beam_search: - _, _, output_words = model.step_beam_search(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only=True) + _, _, output_words = model.step( + sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only=True, use_beam_search=True) outputs = output_words[1:] - output_sentence = ' '.join([rev_vocab[token_id] for token_id in outputs]) + output_sentence = ' '.join([rev_vocab[token_id] + for token_id in outputs]) else: - _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only=True) + _, _, output_logits = model.step( + sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only=True) outputs = [] for logit in output_logits: selected_token_id = int(np.argmax(logit, axis=1)) @@ -63,4 +78,4 @@ def get_predicted_sentence(input_sentence, vocab, rev_vocab, model, sess, use_be outputs.append(selected_token_id) output_sentence = ' '.join([rev_vocab[output] for output in outputs]) - return output_sentence \ No newline at end of file + return output_sentence diff --git a/tf_chatbot/lib/train.py b/tf_chatbot/lib/train.py index d99c913..ab14f7f 100644 --- a/tf_chatbot/lib/train.py +++ b/tf_chatbot/lib/train.py @@ -11,21 +11,27 @@ from tf_chatbot.lib.data_utils import read_data from tf_chatbot.lib import data_utils + def train(): print("Preparing dialog data in %s" % FLAGS.data_dir) - train_data, dev_data, _ = data_utils.prepare_dialog_data(FLAGS.data_dir, FLAGS.vocab_size) + train_data, dev_data, _ = data_utils.prepare_dialog_data( + FLAGS.data_dir, FLAGS.vocab_size) with tf.Session() as sess: - print ("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size)) + print ( + "Creating %d layers of %d units." % + (FLAGS.num_layers, FLAGS.size)) model = create_model(sess, forward_only=False) - print ("Reading development and training data (limit:%d)." % FLAGS.max_train_data_size) + print ( + "Reading development and training data (limit:%d)." 
% + FLAGS.max_train_data_size) dev_set = read_data(dev_data) train_set = read_data(train_data, FLAGS.max_train_data_size) train_bucket_sizes = [len(train_set[b]) for b in range(len(BUCKETS))] train_total_size = float(sum(train_bucket_sizes)) - train_buckets_scale = [sum(train_bucket_sizes[:i+1]) / train_total_size + train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size for i in range(len(train_bucket_sizes))] step_time, loss = 0.0, 0.0 @@ -44,29 +50,40 @@ def train(): _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only=False) - step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint + step_time += (time.time() - start_time) / \ + FLAGS.steps_per_checkpoint loss += step_loss / FLAGS.steps_per_checkpoint current_step += 1 if current_step % FLAGS.steps_per_checkpoint == 0: perplexity = math.exp(loss) if loss < 300 else float('inf') - print ("global step %d learning rate %.4f step-time %.2f perplexity %.2f" % - (model.global_step.eval(), model.learning_rate.eval(), step_time, perplexity)) + print ( + "global step %d learning rate %.4f step-time %.2f perplexity %.2f" % + (model.global_step.eval(), model.learning_rate.eval(), step_time, perplexity)) - if len(previous_losses) > 2 and loss > max(previous_losses[-3:]): + if len(previous_losses) > 2 and loss > max( + previous_losses[-3:]): sess.run(model.learning_rate_decay_op) previous_losses.append(loss) checkpoint_path = os.path.join(FLAGS.model_dir, 'model.ckpt') - model.saver.save(sess, checkpoint_path, global_step=model.global_step) + model.saver.save( + sess, + checkpoint_path, + global_step=model.global_step) step_time, loss = 0.0, 0.0 for bucket_id in range(len(BUCKETS)): - encoder_inputs, decoder_inputs, target_weights = model.get_batch(dev_set, bucket_id) - _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) - - eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf') - print(" eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx)) - - sys.stdout.flush() \ No newline at end of file + encoder_inputs, decoder_inputs, target_weights = model.get_batch( + dev_set, bucket_id) + _, eval_loss, _ = model.step( + sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) + + eval_ppx = math.exp( + eval_loss) if eval_loss < 300 else float('inf') + print( + " eval: bucket %d perplexity %.2f" % + (bucket_id, eval_ppx)) + + sys.stdout.flush() From 25d6c386822d45f5684299d6b4eb1ad0cedf565a Mon Sep 17 00:00:00 2001 From: RuiCheung Date: Fri, 12 May 2017 16:05:32 +0800 Subject: [PATCH 03/14] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E9=85=8D=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tf_chatbot/configs/config.py | 2 +- tf_chatbot/lib/train.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tf_chatbot/configs/config.py b/tf_chatbot/configs/config.py index 918ef9a..4f136c7 100644 --- a/tf_chatbot/configs/config.py +++ b/tf_chatbot/configs/config.py @@ -12,7 +12,7 @@ tf.app.flags.DEFINE_float('max_gradient_norm', 5.0, 'Clip gradients to this norm') tf.app.flags.DEFINE_integer('batch_size', 128, 'Batch size to use during training') -tf.app.flags.DEFINE_integer('vocab_size', 1000, 'Dialog vocabulary size') +tf.app.flags.DEFINE_integer('vocab_size', 20000, 'Dialog vocabulary size') tf.app.flags.DEFINE_integer('size', 128, 'size of each model layer') tf.app.flags.DEFINE_integer('num_layers', 1, 'Numbers of layers in the model') 
tf.app.flags.DEFINE_integer('beam_search_size', 3, 'Size of beam search op') diff --git a/tf_chatbot/lib/train.py b/tf_chatbot/lib/train.py index ab14f7f..df6b1db 100644 --- a/tf_chatbot/lib/train.py +++ b/tf_chatbot/lib/train.py @@ -50,8 +50,7 @@ def train(): _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only=False) - step_time += (time.time() - start_time) / \ - FLAGS.steps_per_checkpoint + step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint loss += step_loss / FLAGS.steps_per_checkpoint current_step += 1 From 75bce23f30b9f7a4e8519a203ec66966017caeda Mon Sep 17 00:00:00 2001 From: RuiZhang1993 Date: Fri, 12 May 2017 16:42:34 +0800 Subject: [PATCH 04/14] =?UTF-8?q?=E9=92=88=E5=AF=B9Windows=E7=B3=BB?= =?UTF-8?q?=E7=BB=9F=E4=BF=AE=E6=94=B9=E9=83=A8=E5=88=86=E8=AF=BB=E5=86=99?= =?UTF-8?q?=E6=93=8D=E4=BD=9C=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- legacy_models/main.py | 35 +++++++++++++++++++++++++++++++++++ tf_chatbot/lib/data_utils.py | 19 ++++++++++--------- 2 files changed, 45 insertions(+), 9 deletions(-) create mode 100644 legacy_models/main.py diff --git a/legacy_models/main.py b/legacy_models/main.py new file mode 100644 index 0000000..09f7355 --- /dev/null +++ b/legacy_models/main.py @@ -0,0 +1,35 @@ +# -*- coding:utf8 -*- +import tensorflow as tf + +from legacy_models.model.lstm.model import Config, Model + +# 对训练数据进行切割 +# splitData() + + +config = Config() +#config.is_pretrained = False +model = Model(config) +sess = tf.Session() +model.variables_init(sess) +model.restore(sess, 24000) +model.train(sess) +model.loss_tracker.savefig(config.save_path) + +resonse = model.generate(sess, "我 对此 感到 非常 开心") +print(resonse) + +''' +vocab_to_idx, idx_to_vocab, vocab_embed = loadPretrainedVector(30, 50, "./dict/vector/wiki.zh.text200.vector") + +for k in vocab_to_idx.keys(): + if u"他"==k: + print(k, vocab_to_idx[k]) + +''' + +#for k in idx_to_vocab.keys(): +# print(k, idx_to_vocab[k]) + +#for i in vocab_embed: +# print(i) diff --git a/tf_chatbot/lib/data_utils.py b/tf_chatbot/lib/data_utils.py index e823e33..038c1fc 100644 --- a/tf_chatbot/lib/data_utils.py +++ b/tf_chatbot/lib/data_utils.py @@ -5,6 +5,7 @@ import os import re import sys +import platform import json from tensorflow.python.platform import gfile @@ -134,15 +135,15 @@ def sentence_to_token_ids(sentence, vocabulary, words = tokenizer(sentence) else: words = basic_tokenizer(sentence) - if not normalize_digits: - return [vocabulary.get(w.encode('utf8'), UNK_ID) for w in words] - return [ - vocabulary.get( - re.sub( - _DIGIT_RE, - "0", - w.encode('utf8')), - UNK_ID) for w in words] + if platform.system() == "Windows": + if not normalize_digits: + return [vocabulary.get(w, UNK_ID) for w in words] + return [vocabulary.get(re.sub(_DIGIT_RE, "0", w), UNK_ID) for w in words] + + else: + if not normalize_digits: + return [vocabulary.get(w.encode('utf8'), UNK_ID) for w in words] + return [vocabulary.get(re.sub(_DIGIT_RE, "0", w.encode('utf8')), UNK_ID) for w in words] def data_to_token_ids_bak(data_path, target_path, vocabulary_path, From b6af9ce229e5a8f12bc9c432d70d69482c6e137e Mon Sep 17 00:00:00 2001 From: RuiZhang1993 Date: Sat, 13 May 2017 01:48:05 +0800 Subject: [PATCH 05/14] Fix bugs --- tf_chatbot/lib/predict.py | 2 +- tf_chatbot/lib/seq2seq_model.py | 34 +++++++++++++++++++++++---- tf_chatbot/lib/seq2seq_model_utils.py | 5 +++- tf_chatbot/lib/train.py | 2 +- 4 files changed, 35 
insertions(+), 8 deletions(-) diff --git a/tf_chatbot/lib/predict.py b/tf_chatbot/lib/predict.py index 6f61c99..bfe8a77 100644 --- a/tf_chatbot/lib/predict.py +++ b/tf_chatbot/lib/predict.py @@ -34,7 +34,7 @@ def _get_test_dataset(): for sentence in test_dataset: predicted_sentence = get_predicted_sentence( sentence, vocab, rev_vocab, model, sess, use_beam_search=True) - print(sentence, '->') + print(sentence.strip(), '->') print(predicted_sentence) results_fh.write(predicted_sentence + '\n') diff --git a/tf_chatbot/lib/seq2seq_model.py b/tf_chatbot/lib/seq2seq_model.py index dce3fbc..3a372a6 100644 --- a/tf_chatbot/lib/seq2seq_model.py +++ b/tf_chatbot/lib/seq2seq_model.py @@ -13,6 +13,20 @@ from tensorflow.contrib.rnn import GRUCell, BasicLSTMCell, MultiRNNCell, EmbeddingWrapper, static_rnn, OutputProjectionWrapper +def _extract_argmax_and_embed(embedding, + output_projection=None, + update_embedding=True): + def loop_function(prev, _): + if output_projection is not None: + prev = tf.nn.xw_plus_b(prev, output_projection[0], output_projection[1]) + prev_symbol = tf.argmax(prev, 1) + emb_prev = tf.nn.embedding_lookup(embedding, prev_symbol) + if not update_embedding: + emb_prev = tf.stop_gradient(emb_prev) + return emb_prev + + return loop_function + def _extract_sample_and_embed(embedding, output_projection=None, update_embedding=True): @@ -42,7 +56,7 @@ def embedding_attention_decoder(decoder_inputs, feed_previous=False, update_embedding_for_previous=True, dtype=None, - scope=None, + use_sample=True, initial_state_attention=False): if output_size is None: output_size = cell.output_size @@ -53,9 +67,17 @@ def embedding_attention_decoder(decoder_inputs, with tf.variable_scope("embedding_attention_decoder", dtype=dtype): embedding = tf.get_variable("embedding", [num_symbols, embedding_size]) - loop_function = _extract_sample_and_embed( - embedding, output_projection, - update_embedding_for_previous) if feed_previous else None + if feed_previous: + if use_sample: + loop_function = _extract_sample_and_embed( + embedding, output_projection, + update_embedding_for_previous) + else: + loop_function = _extract_argmax_and_embed( + embedding, output_projection, + update_embedding_for_previous) + else: + loop_function = None emb_inp = [ tf.nn.embedding_lookup( embedding, @@ -85,6 +107,7 @@ def __init__(self, learning_rate_decay_factor, use_lstm=False, num_samples=512, + use_sample=True, forward_only=False, beam_search_size=1, dtype=tf.float32): @@ -99,6 +122,7 @@ def __init__(self, self.learning_rate * learning_rate_decay_factor) self.global_step = tf.Variable(0, trainable=False) self.beam_search_size = beam_search_size + self.use_sample = use_sample # If we use sampled softmax, we need an output projection. 
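The use_sample switch introduced above only changes how the next symbol is chosen from the projected logits during decoding: argmax for greedy decoding, or a draw from the softmax distribution, which is roughly what the multinomial-based loop_function does. A toy sketch with made-up numbers:

import numpy as np

def next_symbol(logits, use_sample, rng=np.random.default_rng(0)):
    if not use_sample:
        return int(np.argmax(logits))             # greedy: most likely symbol
    probs = np.exp(logits - logits.max())
    probs /= probs.sum()                          # softmax over the vocabulary
    return int(rng.choice(len(logits), p=probs))  # sample one symbol

logits = np.array([0.1, 2.0, 0.5, 1.2])
print(next_symbol(logits, use_sample=False))      # always 1
print(next_symbol(logits, use_sample=True))       # usually 1, sometimes 3 or 2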
output_projection = None @@ -392,7 +416,7 @@ def step( new_beams.sort(key=lambda x: x[0], reverse=True) beams = [] for beam_ in new_beams: - if beam_[2] == data_utils.EOS_ID: + if beam_[2] == data_utils.EOS_ID and len(beam_[1]) > 2: result.append( (beam_[0], beam_[1][:-1], beam_[2], beam_[3])) else: diff --git a/tf_chatbot/lib/seq2seq_model_utils.py b/tf_chatbot/lib/seq2seq_model_utils.py index 10f9bb8..38766f8 100644 --- a/tf_chatbot/lib/seq2seq_model_utils.py +++ b/tf_chatbot/lib/seq2seq_model_utils.py @@ -13,7 +13,7 @@ _INDEX = ".index" -def create_model(session, forward_only): +def create_model(session, forward_only, use_sample=True): model = seq2seq_model.Seq2SeqModel( source_vocab_size=FLAGS.vocab_size, target_vocab_size=FLAGS.vocab_size, @@ -25,6 +25,7 @@ def create_model(session, forward_only): learning_rate=FLAGS.learning_rate, learning_rate_decay_factor=FLAGS.learning_rate_decay_factor, use_lstm=False, + use_sample=use_sample, beam_search_size=FLAGS.beam_search_size, forward_only=forward_only) @@ -62,6 +63,8 @@ def get_predicted_sentence( if use_beam_search: _, _, output_words = model.step( sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only=True, use_beam_search=True) + print("HERE!!!!!") + print(output_words) outputs = output_words[1:] output_sentence = ' '.join([rev_vocab[token_id] for token_id in outputs]) diff --git a/tf_chatbot/lib/train.py b/tf_chatbot/lib/train.py index df6b1db..0ecd2f2 100644 --- a/tf_chatbot/lib/train.py +++ b/tf_chatbot/lib/train.py @@ -21,7 +21,7 @@ def train(): print ( "Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size)) - model = create_model(sess, forward_only=False) + model = create_model(sess, forward_only=False, ) print ( "Reading development and training data (limit:%d)." 
% From 66e55cc25d6b18a60dfef51c84f586cce405e76d Mon Sep 17 00:00:00 2001 From: RuiZhang1993 Date: Mon, 15 May 2017 09:25:53 +0800 Subject: [PATCH 06/14] Save changes --- tf_chatbot/lib/predict.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tf_chatbot/lib/predict.py b/tf_chatbot/lib/predict.py index bfe8a77..0f9a51e 100644 --- a/tf_chatbot/lib/predict.py +++ b/tf_chatbot/lib/predict.py @@ -20,7 +20,7 @@ def _get_test_dataset(): with tf.Session() as sess, open(results_path, 'w') as results_fh: - model = create_model(sess, forward_only=True) + model = create_model(sess, forward_only=True, use_sample=False) model.batch_size = 1 vocab_path = os.path.join( @@ -33,7 +33,7 @@ def _get_test_dataset(): for sentence in test_dataset: predicted_sentence = get_predicted_sentence( - sentence, vocab, rev_vocab, model, sess, use_beam_search=True) + sentence, vocab, rev_vocab, model, sess, use_beam_search=False) print(sentence.strip(), '->') print(predicted_sentence) From 4da925b1f54a44b8d080e21966bab8338de705d2 Mon Sep 17 00:00:00 2001 From: RuiCheung Date: Mon, 15 May 2017 10:25:13 +0800 Subject: [PATCH 07/14] Commit --- tf_chatbot/lib/seq2seq_model_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tf_chatbot/lib/seq2seq_model_utils.py b/tf_chatbot/lib/seq2seq_model_utils.py index 10f9bb8..d87288c 100644 --- a/tf_chatbot/lib/seq2seq_model_utils.py +++ b/tf_chatbot/lib/seq2seq_model_utils.py @@ -13,7 +13,7 @@ _INDEX = ".index" -def create_model(session, forward_only): +def gicreate_model(session, forward_only): model = seq2seq_model.Seq2SeqModel( source_vocab_size=FLAGS.vocab_size, target_vocab_size=FLAGS.vocab_size, From 9609793d400c2076c9ae6c85ec8960a5b9480394 Mon Sep 17 00:00:00 2001 From: RuiCheung Date: Mon, 15 May 2017 10:36:05 +0800 Subject: [PATCH 08/14] Fix sample bug --- tf_chatbot/lib/seq2seq_model.py | 9 ++++----- tf_chatbot/lib/seq2seq_model_utils.py | 2 +- tf_chatbot/lib/train.py | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/tf_chatbot/lib/seq2seq_model.py b/tf_chatbot/lib/seq2seq_model.py index 3a372a6..f09f60e 100644 --- a/tf_chatbot/lib/seq2seq_model.py +++ b/tf_chatbot/lib/seq2seq_model.py @@ -56,7 +56,7 @@ def embedding_attention_decoder(decoder_inputs, feed_previous=False, update_embedding_for_previous=True, dtype=None, - use_sample=True, + use_sample=False, initial_state_attention=False): if output_size is None: output_size = cell.output_size @@ -79,9 +79,7 @@ def embedding_attention_decoder(decoder_inputs, else: loop_function = None emb_inp = [ - tf.nn.embedding_lookup( - embedding, - i) for i in decoder_inputs] + tf.nn.embedding_lookup(embedding,i) for i in decoder_inputs] return attention_decoder( emb_inp, @@ -107,7 +105,7 @@ def __init__(self, learning_rate_decay_factor, use_lstm=False, num_samples=512, - use_sample=True, + use_sample=False, forward_only=False, beam_search_size=1, dtype=tf.float32): @@ -226,6 +224,7 @@ def embedding_attention_sampled_seq2seq( output_size=output_size, output_projection=output_projection, feed_previous=feed_previous, + use_sample=self.use_sample, initial_state_attention=initial_state_attention) else: diff --git a/tf_chatbot/lib/seq2seq_model_utils.py b/tf_chatbot/lib/seq2seq_model_utils.py index 38766f8..8db0dd8 100644 --- a/tf_chatbot/lib/seq2seq_model_utils.py +++ b/tf_chatbot/lib/seq2seq_model_utils.py @@ -13,7 +13,7 @@ _INDEX = ".index" -def create_model(session, forward_only, use_sample=True): +def create_model(session, forward_only, use_sample=False): model = 
seq2seq_model.Seq2SeqModel( source_vocab_size=FLAGS.vocab_size, target_vocab_size=FLAGS.vocab_size, diff --git a/tf_chatbot/lib/train.py b/tf_chatbot/lib/train.py index 0ecd2f2..df6b1db 100644 --- a/tf_chatbot/lib/train.py +++ b/tf_chatbot/lib/train.py @@ -21,7 +21,7 @@ def train(): print ( "Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size)) - model = create_model(sess, forward_only=False, ) + model = create_model(sess, forward_only=False) print ( "Reading development and training data (limit:%d)." % From be659dd58c46d221a1e02d4d28d8bfb5d9466138 Mon Sep 17 00:00:00 2001 From: RuiCheung Date: Mon, 15 May 2017 13:54:18 +0800 Subject: [PATCH 09/14] Commit --- tf_chatbot/configs/config.py | 1 + tf_chatbot/lib/predict.py | 2 +- tf_chatbot/lib/train.py | 5 ++++- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tf_chatbot/configs/config.py b/tf_chatbot/configs/config.py index 4f136c7..12882e7 100644 --- a/tf_chatbot/configs/config.py +++ b/tf_chatbot/configs/config.py @@ -11,6 +11,7 @@ tf.app.flags.DEFINE_float('learning_rate_decay_factor', 0.99, 'Learning rate decays by this much.') tf.app.flags.DEFINE_float('max_gradient_norm', 5.0, 'Clip gradients to this norm') tf.app.flags.DEFINE_integer('batch_size', 128, 'Batch size to use during training') +tf.app.flags.DEFINE_integer('epoch_size', 20, 'Size of epoch') tf.app.flags.DEFINE_integer('vocab_size', 20000, 'Dialog vocabulary size') tf.app.flags.DEFINE_integer('size', 128, 'size of each model layer') diff --git a/tf_chatbot/lib/predict.py b/tf_chatbot/lib/predict.py index 0f9a51e..5a89ce6 100644 --- a/tf_chatbot/lib/predict.py +++ b/tf_chatbot/lib/predict.py @@ -20,7 +20,7 @@ def _get_test_dataset(): with tf.Session() as sess, open(results_path, 'w') as results_fh: - model = create_model(sess, forward_only=True, use_sample=False) + model = create_model(sess, forward_only=True, use_sample=True) model.batch_size = 1 vocab_path = os.path.join( diff --git a/tf_chatbot/lib/train.py b/tf_chatbot/lib/train.py index df6b1db..556497f 100644 --- a/tf_chatbot/lib/train.py +++ b/tf_chatbot/lib/train.py @@ -38,7 +38,10 @@ def train(): current_step = 0 previous_losses = [] - while True: + total_epoch = FLAGS.epoch_size + epoch_steps = np.sum([len(ts) for ts in train_set]) / FLAGS.batch_size + 1 + + while _ in range(epoch_steps * total_epoch): random_number_01 = np.random.random_sample() bucket_id = min([i for i in range(len(train_buckets_scale)) if train_buckets_scale[i] > random_number_01]) From bae3231021869325635ced79c645a4a9101938ea Mon Sep 17 00:00:00 2001 From: RuiZhang1993 Date: Mon, 15 May 2017 14:01:28 +0800 Subject: [PATCH 10/14] fix bug --- tf_chatbot/lib/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tf_chatbot/lib/train.py b/tf_chatbot/lib/train.py index 556497f..9c46f35 100644 --- a/tf_chatbot/lib/train.py +++ b/tf_chatbot/lib/train.py @@ -41,7 +41,7 @@ def train(): total_epoch = FLAGS.epoch_size epoch_steps = np.sum([len(ts) for ts in train_set]) / FLAGS.batch_size + 1 - while _ in range(epoch_steps * total_epoch): + while model.global_step.eval() < (epoch_steps * total_epoch): random_number_01 = np.random.random_sample() bucket_id = min([i for i in range(len(train_buckets_scale)) if train_buckets_scale[i] > random_number_01]) From 676a31bc08da29405bfd1636ac6530024cd5f7cb Mon Sep 17 00:00:00 2001 From: RuiZhang1993 Date: Mon, 15 May 2017 20:01:13 +0800 Subject: [PATCH 11/14] debug beam search --- tf_chatbot/lib/predict.py | 7 ++-- tf_chatbot/lib/seq2seq_model.py | 50 
+++++++++++++++++---------- tf_chatbot/lib/seq2seq_model_utils.py | 18 ++++++++-- 3 files changed, 51 insertions(+), 24 deletions(-) diff --git a/tf_chatbot/lib/predict.py b/tf_chatbot/lib/predict.py index 5a89ce6..c539af1 100644 --- a/tf_chatbot/lib/predict.py +++ b/tf_chatbot/lib/predict.py @@ -20,7 +20,7 @@ def _get_test_dataset(): with tf.Session() as sess, open(results_path, 'w') as results_fh: - model = create_model(sess, forward_only=True, use_sample=True) + model = create_model(sess, forward_only=True, use_sample=False) model.batch_size = 1 vocab_path = os.path.join( @@ -33,8 +33,11 @@ def _get_test_dataset(): for sentence in test_dataset: predicted_sentence = get_predicted_sentence( - sentence, vocab, rev_vocab, model, sess, use_beam_search=False) + sentence, vocab, rev_vocab, model, sess, use_beam_search=True) print(sentence.strip(), '->') print(predicted_sentence) + # ----------For Debug ---------- + #break + # ----------End Debug ---------- results_fh.write(predicted_sentence + '\n') diff --git a/tf_chatbot/lib/seq2seq_model.py b/tf_chatbot/lib/seq2seq_model.py index f09f60e..0b90b91 100644 --- a/tf_chatbot/lib/seq2seq_model.py +++ b/tf_chatbot/lib/seq2seq_model.py @@ -35,7 +35,7 @@ def loop_function(prev, _): if output_projection is not None: prev = tf.nn.xw_plus_b( prev, output_projection[0], output_projection[1]) - prev_symbol = tf.squeeze(tf.multinomial(prev, 1), axis=1) + prev_symbol = tf.squeeze(tf.multinomial(tf.nn.softmax(prev), 1), axis=1) emb_prev = tf.nn.embedding_lookup(embedding, prev_symbol) if not update_embedding: emb_prev = tf.stop_gradient(emb_prev) @@ -373,53 +373,65 @@ def step( # attention_state, encoder_state outputs = session.run(output_feed, input_feed) # score, result, last_token, encoder_state + + temp_encoder_states = outputs[1][0] + beams = [(0.0, [data_utils.GO_ID], data_utils.GO_ID, - outputs[1])] * 3 + outputs[1][0])] * self.beam_search_size result = [] step = 0 attention_state = outputs[0] - while step < decoder_size and len( - result) < self.beam_search_size: + while step < decoder_size and len(result) < self.beam_search_size: step += 1 - _last_tokens = [beam_[2] for beam_ in beams] _encoder_state = [beam_[3] for beam_ in beams] - output_feed = [ - self.topk_ids[bucket_id], - self.topk_probs[bucket_id], - self.decoder_out_state[bucket_id]] + output_feed = self.outputs[bucket_id] + #self.decoder_out_state[bucket_id]] input_feed = {} input_feed[self.model_attention_states[bucket_id].name] = attention_state - input_feed[self.model_encoder_states[bucket_id].name] = np.squeeze( - np.array(_encoder_state)) + input_feed[self.model_encoder_states[bucket_id].name] = np.squeeze(np.array(_encoder_state)) + for l in range(step): - _decoder_inputs = [beam_[1][l] for beam_ in beams] + _decoder_inputs = np.array([beam_[1][l] for beam_ in beams]) input_feed[self.decoder_inputs[l].name] = _decoder_inputs - _tok_ids, _tok_probs, _out_states = session.run( - output_feed, input_feed) + _outputs = session.run(output_feed, input_feed) + + _tok_probs, _tok_ids = [], [] + for _idx in range(self.beam_search_size): + _tok_prob, _tok_id = tf.nn.top_k(tf.nn.softmax(_outputs[step-1][_idx]), self.beam_search_size) + _tok_probs.append(_tok_prob.eval()) + _tok_ids.append(_tok_id.eval()) new_beams = [] for beam_idx in range(self.beam_search_size): for _idx in range(self.beam_search_size): + #print("before - ", "score:", beams[beam_idx][0], "strs:", beams[beam_idx][1], "next:", beams[beam_idx][2]) new_beams.append( (beams[beam_idx][0] + _tok_probs[beam_idx][_idx], - 
beams[beam_idx][1] + [ - _tok_ids[beam_idx][_idx]], - _tok_ids[beam_idx][_idx], - _out_states[beam_idx])) + beams[beam_idx][1] + [_tok_ids[beam_idx][_idx]], + _tok_ids[beam_idx][_idx], + #_out_states[beam_idx])) + temp_encoder_states)) + #print("after - ", "score:", new_beams[-1][0], "strs:", new_beams[-1][1], "next:",new_beams[-1][2]) + #print("=========") new_beams.sort(key=lambda x: x[0], reverse=True) + + unduplicate_set = set() beams = [] for beam_ in new_beams: + #if False: if beam_[2] == data_utils.EOS_ID and len(beam_[1]) > 2: result.append( (beam_[0], beam_[1][:-1], beam_[2], beam_[3])) else: - beams.append(beam_) + if str(beam_[1]) not in unduplicate_set: + unduplicate_set.add(str(beam_[1])) + beams.append(beam_) if len(beams) == self.beam_search_size: break diff --git a/tf_chatbot/lib/seq2seq_model_utils.py b/tf_chatbot/lib/seq2seq_model_utils.py index 8db0dd8..0544ffd 100644 --- a/tf_chatbot/lib/seq2seq_model_utils.py +++ b/tf_chatbot/lib/seq2seq_model_utils.py @@ -61,10 +61,22 @@ def get_predicted_sentence( feed_data, bucket_id) if use_beam_search: + new_encoder_inputs, new_decoder_inputs, new_target_weights = [],[],[] + for _array in decoder_inputs: + for _item in _array: + _de_input = np.array([_item] * FLAGS.beam_search_size, dtype=np.int32) + new_decoder_inputs.append(_de_input) + for _array in encoder_inputs: + for _item in _array: + _en_input = np.array([_item] * FLAGS.beam_search_size, dtype=np.int32) + new_encoder_inputs.append(_en_input) + for _array in target_weights: + for _item in _array: + _ta_input = np.array([_item] * FLAGS.beam_search_size, dtype=np.int32) + new_target_weights.append(_ta_input) + _, _, output_words = model.step( - sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only=True, use_beam_search=True) - print("HERE!!!!!") - print(output_words) + sess, new_encoder_inputs, new_decoder_inputs, new_target_weights, bucket_id, forward_only=True, use_beam_search=True) outputs = output_words[1:] output_sentence = ' '.join([rev_vocab[token_id] for token_id in outputs]) From 61576e2191b085ab0f6e571d6b3061b69d2744f4 Mon Sep 17 00:00:00 2001 From: RuiCheung Date: Tue, 16 May 2017 09:28:25 +0800 Subject: [PATCH 12/14] update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index eb7fcbb..a28edb5 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,2 @@ # SMIPG-NLPCC2017 -Emotional Conversation Generation Task in NLPCC2017 +GRU + Attention + Beam Search (+ Sample)的Seq2Seq模型 From f1820520dfbed4cd21fd202d438dfbf842f9560e Mon Sep 17 00:00:00 2001 From: RuiZhang1993 Date: Tue, 16 May 2017 16:55:10 +0800 Subject: [PATCH 13/14] update --- tf_chatbot/lib/seq2seq_model.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tf_chatbot/lib/seq2seq_model.py b/tf_chatbot/lib/seq2seq_model.py index 0b90b91..61160e2 100644 --- a/tf_chatbot/lib/seq2seq_model.py +++ b/tf_chatbot/lib/seq2seq_model.py @@ -384,6 +384,9 @@ def step( step = 0 attention_state = outputs[0] + def numpy_softmax(x): + return np.exp(x) / np.sum(np.exp(x), axis=0) + while step < decoder_size and len(result) < self.beam_search_size: step += 1 _encoder_state = [beam_[3] for beam_ in beams] @@ -400,10 +403,16 @@ def step( _outputs = session.run(output_feed, input_feed) _tok_probs, _tok_ids = [], [] - for _idx in range(self.beam_search_size): - _tok_prob, _tok_id = tf.nn.top_k(tf.nn.softmax(_outputs[step-1][_idx]), self.beam_search_size) - _tok_probs.append(_tok_prob.eval()) - 
_tok_ids.append(_tok_id.eval()) + + if step == 1: + for _idx in range(self.beam_search_size): + _tok_ids.append(np.random.choice(range(self.target_vocab_size), size=self.beam_search_size, replace=False, p=numpy_softmax(_outputs[step-1][_idx]))) + _tok_probs.append(_outputs[step-1][_idx][_tok_ids[_idx]]) + else: + for _idx in range(self.beam_search_size): + _tok_prob, _tok_id = tf.nn.top_k(tf.nn.softmax(_outputs[step-1][_idx]), self.beam_search_size) + _tok_probs.append(_tok_prob.eval()) + _tok_ids.append(_tok_id.eval()) new_beams = [] @@ -425,7 +434,7 @@ def step( beams = [] for beam_ in new_beams: #if False: - if beam_[2] == data_utils.EOS_ID and len(beam_[1]) > 2: + if beam_[2] == data_utils.EOS_ID: result.append( (beam_[0], beam_[1][:-1], beam_[2], beam_[3])) else: From 527c2c1a433b9775525f92f2662272d50bf5014e Mon Sep 17 00:00:00 2001 From: RuiCheung Date: Wed, 17 May 2017 13:47:44 +0800 Subject: [PATCH 14/14] Fix bugs --- tf_chatbot/configs/config.py | 3 +++ tf_chatbot/lib/predict.py | 7 ++----- tf_chatbot/lib/seq2seq_model.py | 28 +++++++++++++++++++++------- 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/tf_chatbot/configs/config.py b/tf_chatbot/configs/config.py index 12882e7..79fe901 100644 --- a/tf_chatbot/configs/config.py +++ b/tf_chatbot/configs/config.py @@ -21,6 +21,9 @@ tf.app.flags.DEFINE_integer('max_train_data_size', 0, 'Limit on the size of training data (0: no limit)') tf.app.flags.DEFINE_integer('steps_per_checkpoint', 100, 'How many training steps to do per checkpoint') +tf.app.flags.DEFINE_boolean('use_sample', True, 'use sample while generating') +tf.app.flags.DEFINE_boolean('use_beam_search', True, 'use beam search while generating') + FLAGS = tf.app.flags.FLAGS BUCKETS = [(5,10), (10, 15), (20, 25), (40, 50)] \ No newline at end of file diff --git a/tf_chatbot/lib/predict.py b/tf_chatbot/lib/predict.py index c539af1..e1266f5 100644 --- a/tf_chatbot/lib/predict.py +++ b/tf_chatbot/lib/predict.py @@ -20,7 +20,7 @@ def _get_test_dataset(): with tf.Session() as sess, open(results_path, 'w') as results_fh: - model = create_model(sess, forward_only=True, use_sample=False) + model = create_model(sess, forward_only=True, use_sample=FLAGS.use_sample) model.batch_size = 1 vocab_path = os.path.join( @@ -33,11 +33,8 @@ def _get_test_dataset(): for sentence in test_dataset: predicted_sentence = get_predicted_sentence( - sentence, vocab, rev_vocab, model, sess, use_beam_search=True) + sentence, vocab, rev_vocab, model, sess, use_beam_search=FLAGS.use_beam_search) print(sentence.strip(), '->') print(predicted_sentence) - # ----------For Debug ---------- - #break - # ----------End Debug ---------- results_fh.write(predicted_sentence + '\n') diff --git a/tf_chatbot/lib/seq2seq_model.py b/tf_chatbot/lib/seq2seq_model.py index 61160e2..fc69eb2 100644 --- a/tf_chatbot/lib/seq2seq_model.py +++ b/tf_chatbot/lib/seq2seq_model.py @@ -10,7 +10,7 @@ import tf_chatbot.lib.data_utils as data_utils from tensorflow.contrib.legacy_seq2seq import sequence_loss, attention_decoder -from tensorflow.contrib.rnn import GRUCell, BasicLSTMCell, MultiRNNCell, EmbeddingWrapper, static_rnn, OutputProjectionWrapper +from tensorflow.contrib.rnn import GRUCell, BasicLSTMCell, MultiRNNCell, EmbeddingWrapper, static_rnn, OutputProjectionWrapper, static_bidirectional_rnn def _extract_argmax_and_embed(embedding, @@ -104,6 +104,7 @@ def __init__(self, learning_rate, learning_rate_decay_factor, use_lstm=False, + use_bidirection=True, num_samples=512, use_sample=False, forward_only=False, @@ -128,9 
+129,10 @@ def __init__(self, # Sampled softmax only makes sense if we sample less than vocabulary # size. if num_samples > 0 and num_samples < self.target_vocab_size: - w_t = tf.get_variable( - "proj_w", [ - self.target_vocab_size, size], dtype=dtype) + if use_bidirection: + w_t = tf.get_variable("proj_w", [self.target_vocab_size, size * 2], dtype=dtype) + else: + w_t = tf.get_variable("proj_w", [self.target_vocab_size, size], dtype=dtype) w = tf.transpose(w_t) b = tf.get_variable( "proj_b", [ @@ -187,6 +189,7 @@ def embedding_attention_sampled_seq2seq( output_projection=None, feed_previous=False, initial_state_attention=False, + use_bidirection=False, dtype=tf.float32): with tf.variable_scope("embedding_attention_sampled_seq2seq"): encoder_cell = EmbeddingWrapper( @@ -194,8 +197,18 @@ def embedding_attention_sampled_seq2seq( embedding_classes=num_encoder_symbols, embedding_size=embedding_size ) - encoder_outputs, encoder_state = static_rnn( - encoder_cell, encoder_inputs, dtype=dtype) + if not use_bidirection: + encoder_outputs, encoder_state = static_rnn( + encoder_cell, encoder_inputs, dtype=dtype) + else: + encoder_outputs, encoder_state_fw, encoder_state_bw = static_bidirectional_rnn( + cell_fw=encoder_cell, + cell_bw=encoder_cell, + inputs=encoder_inputs, + dtype=dtype) + encoder_state = tf.concat([encoder_state_fw, encoder_state_bw], axis=1) + + cell = GRUCell(cell.state_size * 2) top_states = [tf.reshape( e, [-1, 1, cell.output_size]) for e in encoder_outputs] @@ -239,7 +252,8 @@ def embedding_attention_sampled_seq2seq( embedding_size=size, bucket_index=bucket_id, output_projection=output_projection, - feed_previous=do_decode) + feed_previous=do_decode, + use_bidirection=use_bidirection) # Feeds for inputs. self.encoder_inputs = []
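
Note on patch 14: the optional bidirectional GRU encoder concatenates the forward and backward final states, so every tensor that used to be `size` wide becomes `2 * size` wide downstream; that is why proj_w grows to [target_vocab_size, size * 2] and the decoder cell is rebuilt with a doubled state size. The following is a small numpy illustration of the shape bookkeeping only, with made-up toy values and no TensorFlow, not the repository's code:

    import numpy as np

    size, vocab_size = 128, 1000
    state_fw = np.random.randn(1, size)           # forward encoder final state
    state_bw = np.random.randn(1, size)           # backward encoder final state
    encoder_state = np.concatenate([state_fw, state_bw], axis=1)   # shape [1, 2*size]

    w = np.random.randn(2 * size, vocab_size)     # proj_w is [vocab_size, 2*size], used transposed
    b = np.zeros(vocab_size)
    logits = encoder_state @ w + b                # output projection must accept 2*size inputs
    print(encoder_state.shape, logits.shape)      # (1, 256) (1, 1000)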
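
Note on patches 05-13: taken together they implement beam search as a Python-side loop inside Seq2SeqModel.step(): each surviving beam is expanded with its top-k next tokens, candidates are re-ranked by cumulative score, prefixes that emit EOS are moved to the result list, and duplicate prefixes are dropped. Below is a minimal self-contained sketch of that loop; GO_ID, EOS_ID, score_fn and the toy scorer are stand-ins for illustration, not the repository's API:

    import numpy as np

    GO_ID, EOS_ID = 1, 2           # assumed special token ids
    BEAM_SIZE, MAX_LEN = 3, 10     # mirrors FLAGS.beam_search_size / decoder_size

    def softmax(logits):
        # numerically stable softmax (the patch's numpy_softmax omits the max shift)
        z = logits - np.max(logits)
        e = np.exp(z)
        return e / e.sum()

    def beam_search(score_fn, vocab_size=20):
        # score_fn(prefix) -> logits over the vocabulary for the next token;
        # it stands in for one decoder step of the real model.
        beams = [(0.0, [GO_ID], GO_ID)]            # (cumulative score, tokens, last token)
        finished = []
        for _ in range(MAX_LEN):
            if len(finished) >= BEAM_SIZE:
                break
            candidates = []
            for score, tokens, _last in beams:
                probs = softmax(score_fn(tokens))
                top_ids = np.argsort(probs)[-BEAM_SIZE:][::-1]          # top-k next tokens
                for tok in top_ids:
                    candidates.append((score + probs[tok], tokens + [int(tok)], int(tok)))
            candidates.sort(key=lambda c: c[0], reverse=True)
            beams, seen = [], set()
            for cand in candidates:
                if cand[2] == EOS_ID:
                    finished.append((cand[0], cand[1][:-1]))            # drop the EOS token
                elif str(cand[1]) not in seen:                          # de-duplicate prefixes
                    seen.add(str(cand[1]))
                    beams.append(cand)
                if len(beams) == BEAM_SIZE:
                    break
        return finished or [(s, t) for s, t, _ in beams]

    # toy usage: a fake scorer that prefers high token ids, then strongly prefers EOS
    print(beam_search(lambda toks: np.arange(20) * 0.1
                      + (5.0 if len(toks) > 2 else 0.0) * np.eye(20)[EOS_ID]))

Like the patched step(), this sketch accumulates raw per-step probabilities; summing log-probabilities is the more common scoring choice and avoids favoring longer prefixes for numerical reasons.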
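
Note on patch 11's feed preparation: when use_beam_search=True, get_predicted_sentence() first replicates every time-major feed beam_search_size times so the batch dimension carries one row per beam. A rough numpy sketch of that replication (tile_for_beam is a hypothetical helper, not a function in the repo):

    import numpy as np

    def tile_for_beam(time_major_inputs, beam_size):
        # each element is the batch (size 1) for one time step; repeat it once per beam
        return [np.repeat(np.asarray(step_vals, dtype=np.int32), beam_size)
                for step_vals in time_major_inputs]

    encoder_inputs = [np.array([7]), np.array([42]), np.array([0])]   # toy token ids, batch_size = 1
    print(tile_for_beam(encoder_inputs, beam_size=3))
    # [array([7, 7, 7], dtype=int32), array([42, 42, 42], dtype=int32), array([0, 0, 0], dtype=int32)]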
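
Note on patches 09-10: the infinite training loop is replaced by a bound derived from an epoch count; the number of updates per epoch is estimated from the bucketed training set and the batch size, and training stops once the model's global step reaches epoch_size epochs. A toy sketch of the arithmetic (values are made up, and integer division is used here for clarity):

    train_set = [[("q", "a")] * 500, [("q", "a")] * 300]   # toy buckets
    batch_size, total_epoch = 128, 20
    epoch_steps = sum(len(bucket) for bucket in train_set) // batch_size + 1
    global_step = 0
    while global_step < epoch_steps * total_epoch:
        global_step += 1            # stands in for one model.step(...) update
    print(global_step)              # 7 * 20 = 140 updates for this toy data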