This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit aee16b4

Author: Ryan Sepassi

bug fixes

1 parent cbdb75d

6 files changed: +42 −19 lines changed

README.md

Lines changed: 24 additions & 4 deletions
@@ -17,33 +17,53 @@ issues](https://github.com/tensorflow/tensor2tensor/issues).
 ```
 pip install tensor2tensor
 
-DATA_DIR=$HOME/data
+DATA_DIR=$HOME/t2t_data
+TMP_DIR=/tmp/t2t_datagen
 PROBLEM=wmt_ende_tokens_32k
 MODEL=transformer
 HPARAMS=transformer_base
-TRAIN_DIR=$HOME/train
+TRAIN_DIR=$HOME/t2t_train/$PROBLEM_$MODEL_$HPARAMS
+
+mkdir $DATA_DIR $TMP_DIR $TRAIN_DIR
 
 # Generate data
 t2t-datagen \
   --data_dir=$DATA_DIR \
+  --tmp_dir=$TMP_DIR \
   --problem=$PROBLEM
 
+mv $TMP_DIR/tokens.vocab.32768 $DATA_DIR
+
 # Train
 t2t-trainer \
   --data_dir=$DATA_DIR \
   --problems=$PROBLEM \
   --model=$MODEL \
   --hparams_set=$HPARAMS \
-  --output_dir=$TRAIN_DIR \
+  --output_dir=$TRAIN_DIR
 
 # Decode
+
+DECODE_FILE=$DATA_DIR/decode_this.txt
+echo "Hello world" >> $DECODE_FILE
+echo "Goodbye world" >> $DECODE_FILE
+
+BEAM_SIZE=4
+ALPHA=0.6
+
 t2t-trainer \
   --data_dir=$DATA_DIR \
   --problems=$PROBLEM \
   --model=$MODEL \
   --hparams_set=$HPARAMS \
   --output_dir=$TRAIN_DIR \
-  --decode_from_file=$DATA_DIR/decode_this.txt
+  --train_steps=0 \
+  --eval_steps=0 \
+  --beam_size=$BEAM_SIZE \
+  --alpha=$ALPHA \
+  --decode_from_file=$DECODE_FILE
+
+cat $DECODE_FILE.$MODEL.$HPARAMS.beam$BEAM_SIZE.alpha$ALPHA.decodes
 ```
 
 T2T modularizes training into several components, each of which can be seen in

setup.py

Lines changed: 3 additions & 4 deletions
@@ -4,7 +4,7 @@
 
 setup(
     name='tensor2tensor',
-    version='1.0.1.dev1',
+    version='1.0.2',
     description='Tensor2Tensor',
     author='Google Inc.',
     author_email='[email protected]',
@@ -18,12 +18,11 @@
         'six',
         'tensorflow-gpu>=1.2.0rc1',
     ],
-    classifiers = [
+    classifiers=[
         'Development Status :: 4 - Beta',
         'Intended Audience :: Developers',
         'Intended Audience :: Science/Research',
         'License :: OSI Approved :: Apache Software License',
         'Topic :: Scientific/Engineering :: Artificial Intelligence',
     ],
-    keywords='tensorflow',
-)
+    keywords='tensorflow',)

tensor2tensor/bin/t2t-datagen

Lines changed: 2 additions & 2 deletions
@@ -47,8 +47,8 @@ flags = tf.flags
 FLAGS = flags.FLAGS
 
 flags.DEFINE_string("data_dir", "", "Data directory.")
-flags.DEFINE_string("tmp_dir",
-                    tempfile.gettempdir(), "Temporary storage directory.")
+flags.DEFINE_string("tmp_dir", "/tmp/t2t_datagen",
+                    "Temporary storage directory.")
 flags.DEFINE_string("problem", "",
                     "The name of the problem to generate data for.")
 flags.DEFINE_integer("num_shards", 1, "How many shards to use.")

tensor2tensor/data_generators/text_encoder.py

Lines changed: 4 additions & 1 deletion
@@ -229,7 +229,10 @@ def subtoken_to_subtoken_string(self, subtoken):
         self._all_subtoken_strings[subtoken]):
       return self._all_subtoken_strings[subtoken]
     else:
-      return 'ID%d_' % subtoken
+      if 0 <= subtoken < self._num_reserved_ids:
+        return '%s_' % RESERVED_TOKENS[subtoken]
+      else:
+        return 'ID%d_' % subtoken
 
   def _escaped_token_to_subtokens(self, escaped_token):
     """Converts an escaped token string to a list of subtokens.

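With this change, an out-of-vocabulary subtoken ID that falls in the reserved range renders as its reserved token's name rather than the generic ID form. A minimal standalone sketch of the fallback, assuming RESERVED_TOKENS matches the pad/EOS pair in text_encoder.py; the helper name and example values here are hypothetical:

```
# Sketch of the new fallback in subtoken_to_subtoken_string.
# Assumes RESERVED_TOKENS matches text_encoder.py's pad/EOS pair.
RESERVED_TOKENS = ["<pad>", "<EOS>"]
NUM_RESERVED_IDS = len(RESERVED_TOKENS)


def subtoken_id_to_string(subtoken, all_subtoken_strings):
  """Renders a subtoken ID as a printable subtoken string."""
  if (0 <= subtoken < len(all_subtoken_strings) and
      all_subtoken_strings[subtoken]):
    return all_subtoken_strings[subtoken]
  # New behavior: reserved IDs print their token name, not "ID%d_".
  if 0 <= subtoken < NUM_RESERVED_IDS:
    return "%s_" % RESERVED_TOKENS[subtoken]
  return "ID%d_" % subtoken


print(subtoken_id_to_string(1, ["", ""]))   # <EOS>_
print(subtoken_id_to_string(99, ["", ""]))  # ID99_
```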
tensor2tensor/data_generators/wmt.py

Lines changed: 8 additions & 6 deletions
@@ -203,17 +203,19 @@ def _compile_data(tmp_dir, datasets, filename):
   with tf.gfile.GFile(filename + ".lang1", mode="w") as lang1_file:
     i = 0
     while i <= len(lang1_lines):
-      lang1_file.writelines(
-          lang1_lines[i * write_chunk_size:(i + 1) * write_chunk_size])
+      for line in lang1_lines[i * write_chunk_size:(i + 1) * write_chunk_size]:
+        lang1_file.write(line)
       i += 1
-    lang1_file.writelines(lang1_lines[i * write_chunk_size:])
+    for line in lang1_lines[i * write_chunk_size:]:
+      lang1_file.write(line)
   with tf.gfile.GFile(filename + ".lang2", mode="w") as lang2_file:
     i = 0
     while i <= len(lang2_lines):
-      lang2_file.writelines(
-          lang2_lines[i * write_chunk_size:(i + 1) * write_chunk_size])
+      for line in lang2_lines[i * write_chunk_size:(i + 1) * write_chunk_size]:
+        lang2_file.write(line)
       i += 1
-    lang2_file.writelines(lang2_lines[i * write_chunk_size:])
+    for line in lang2_lines[i * write_chunk_size:]:
+      lang2_file.write(line)
   return filename
 
 
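The writes still proceed chunk by chunk, but each line is now handed to write() individually instead of passing a whole list slice to writelines(). Note that the loop guard, while i <= len(lang1_lines), compares a chunk index against the total line count, so later passes cover empty slices. A minimal sketch of the per-line chunked-write pattern with the loop expressed over chunk offsets instead, using a built-in file object in place of tf.gfile.GFile; the function name and chunk size are illustrative:

```
# Sketch of chunked per-line writing as in _compile_data, with the
# loop expressed over chunk start offsets; open() stands in for
# tf.gfile.GFile here.
def write_in_chunks(path, lines, chunk_size=10000):
  """Writes lines to path one at a time, one chunk-sized slice per pass."""
  with open(path, "w") as f:
    for start in range(0, len(lines), chunk_size):
      for line in lines[start:start + chunk_size]:
        f.write(line)


write_in_chunks("/tmp/example.lang1", ["hello\n", "world\n"], chunk_size=1)
```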
tensor2tensor/utils/trainer_utils.py

Lines changed: 1 addition & 2 deletions
@@ -656,8 +656,7 @@ def log_fn(inputs, outputs):
   base_filename = FLAGS.decode_from_file
   decode_filename = (
       base_filename + "." + FLAGS.model + "." + FLAGS.hparams_set + ".beam" +
-      str(FLAGS.beam_size) + ".a" + str(FLAGS.alpha) + ".alpha" +
-      str(FLAGS.alpha) + ".decodes")
+      str(FLAGS.beam_size) + ".alpha" + str(FLAGS.alpha) + ".decodes")
   tf.logging.info("Writing decodes into %s" % decode_filename)
   outfile = tf.gfile.Open(decode_filename, "w")
   for index in range(len(sorted_inputs)):
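This drops the duplicated ".a<alpha>" segment, so decode outputs land at <decode_file>.<model>.<hparams_set>.beam<beam_size>.alpha<alpha>.decodes, which is the filename the cat line added to the README reads. A small sketch of the naming scheme, with plain variables standing in for the FLAGS values and using the README's example settings:

```
# Sketch of the fixed decode-output naming; plain variables stand in
# for FLAGS values, using the README's example settings.
base_filename = "decode_this.txt"
model = "transformer"
hparams_set = "transformer_base"
beam_size = 4
alpha = 0.6

decode_filename = (
    base_filename + "." + model + "." + hparams_set + ".beam" +
    str(beam_size) + ".alpha" + str(alpha) + ".decodes")
print(decode_filename)
# decode_this.txt.transformer.transformer_base.beam4.alpha0.6.decodes
```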
