
Commit 1047fa6

Author: Ryan Sepassi
Document distributed training and update README

1 parent aee16b4

File tree

7 files changed, +192 -8 lines changed


README.md

Lines changed: 10 additions & 5 deletions
````diff
@@ -17,14 +17,14 @@ issues](https://github.com/tensorflow/tensor2tensor/issues).
 ```
 pip install tensor2tensor
 
-DATA_DIR=$HOME/t2t_data
-TMP_DIR=/tmp/t2t_datagen
 PROBLEM=wmt_ende_tokens_32k
 MODEL=transformer
 HPARAMS=transformer_base
-TRAIN_DIR=$HOME/t2t_train/$PROBLEM_$MODEL_$HPARAMS
+DATA_DIR=$HOME/t2t_data
+TMP_DIR=/tmp/t2t_datagen
+TRAIN_DIR=$HOME/t2t_train/$PROBLEM/$MODEL-$HPARAMS
 
-mkdir $DATA_DIR $TMP_DIR $TRAIN_DIR
+mkdir -p $DATA_DIR $TMP_DIR $TRAIN_DIR
 
 # Generate data
 t2t-datagen \
````
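
(A note on the `TRAIN_DIR` fix above: in shell, `$PROBLEM_$MODEL_$HPARAMS` is parsed as the undefined variables `$PROBLEM_` and `$MODEL_`, because underscores are valid in variable names, so the old path silently collapsed to `$HOME/t2t_train/` plus only the `$HPARAMS` value. The `$PROBLEM/$MODEL-$HPARAMS` form avoids the ambiguity, and `mkdir -p` creates the now-nested path, parents included, without erroring if it already exists.)
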
```diff
@@ -69,6 +69,10 @@ cat $DECODE_FILE.$MODEL.$HPARAMS.beam$BEAM_SIZE.alpha$ALPHA.decodes
 T2T modularizes training into several components, each of which can be seen in
 use in the above commands.
 
+See the models, problems, and hyperparameter sets that are available:
+
+`t2t-trainer --registry_help`
+
 ### Datasets
 
 **Datasets** are all standardized on TFRecord files with `tensorflow.Example`
```
```diff
@@ -118,7 +122,8 @@ The **trainer** binary is the main entrypoint for training, evaluation, and
 inference. Users can easily switch between problems, models, and hyperparameter
 sets by using the `--model`, `--problems`, and `--hparams_set` flags. Specific
 hyperparameters can be overriden with the `--hparams` flag. `--schedule` and
-related flags control local and distributed training/evaluation.
+related flags control local and distributed training/evaluation
+([distributed training documentation](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/docs/distributed_training.md)).
 
 ## Adding a dataset
```
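
As an illustration of how these flags compose, here is a minimal sketch of a local training run using the quickstart variables from above. The flag names are the ones documented in this README and in the new distributed-training doc; treat it as a sketch rather than the canonical command, since additional flags may be required:

```
t2t-trainer \
  --data_dir=$DATA_DIR \
  --problems=$PROBLEM \
  --model=$MODEL \
  --hparams_set=$HPARAMS \
  --output_dir=$TRAIN_DIR
```
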

setup.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -1,6 +1,7 @@
 """Install tensor2tensor."""
 
-from setuptools import setup, find_packages
+from setuptools import find_packages
+from setuptools import setup
 
 setup(
     name='tensor2tensor',
```

tensor2tensor/bin/make_tf_configs.py

Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
1+
# Copyright 2017 Google Inc.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""Output command line arguments and json-encoded TF_CONFIGs.
16+
17+
Usage:
18+
19+
`make_tf_configs.py --workers="server1:1234" --ps="server3:2134,server4:2334"`
20+
21+
Outputs 1 line per job to stdout, first the workers, then the parameter servers.
22+
Each line has the TF_CONFIG, then a tab, then the command line flags for that
23+
job.
24+
25+
If there is a single worker, workers will have the `--sync` flag.
26+
"""
27+
from __future__ import absolute_import
28+
from __future__ import division
29+
from __future__ import print_function
30+
31+
import json
32+
33+
# Dependency imports
34+
35+
import tensorflow as tf
36+
37+
flags = tf.flags
38+
FLAGS = flags.FLAGS
39+
40+
flags.DEFINE_string("workers", "", "Comma-separated list of worker addresses")
41+
flags.DEFINE_string("ps", "", "Comma-separated list of ps addresses")
42+
43+
44+
def main(_):
45+
if not (FLAGS.workers and FLAGS.ps):
46+
raise ValueError("Must provide --workers and --ps")
47+
48+
workers = FLAGS.workers.split(",")
49+
ps = FLAGS.ps.split(",")
50+
51+
cluster = {"ps": ps, "worker": workers}
52+
53+
for task_type, jobs in [("worker", workers), ("ps", ps)]:
54+
for idx, job in enumerate(jobs):
55+
if task_type == "worker":
56+
cmd_line_flags = " ".join([
57+
"--master=%s" % job,
58+
"--ps_replicas=%d" % len(ps),
59+
"--worker_replicas=%d" % len(workers),
60+
"--worker_gpu=1",
61+
"--worker_id=%d" % idx,
62+
"--ps_gpu=1",
63+
"--schedule=train",
64+
"--sync" if len(workers) == 1 else "",
65+
])
66+
else:
67+
cmd_line_flags = " ".join([
68+
"--schedule=run_std_server",
69+
])
70+
71+
tf_config = json.dumps({
72+
"cluster": cluster,
73+
"task": {
74+
"type": task_type,
75+
"index": idx
76+
}
77+
})
78+
print(tf_config + "\t" + cmd_line_flags)
79+
80+
81+
if __name__ == "__main__":
82+
tf.app.run()
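
To make the output format concrete, here is roughly what the docstring's example invocation would print, re-derived from the code above (illustrative: JSON key order may vary by Python version, and the literal tab between the config and the flags is widened here for readability):

```
$ python make_tf_configs.py --workers="server1:1234" --ps="server3:2134,server4:2334"
{"cluster": {"ps": ["server3:2134", "server4:2334"], "worker": ["server1:1234"]}, "task": {"type": "worker", "index": 0}}    --master=server1:1234 --ps_replicas=2 --worker_replicas=1 --worker_gpu=1 --worker_id=0 --ps_gpu=1 --schedule=train --sync
{"cluster": {"ps": ["server3:2134", "server4:2334"], "worker": ["server1:1234"]}, "task": {"type": "ps", "index": 0}}    --schedule=run_std_server
{"cluster": {"ps": ["server3:2134", "server4:2334"], "worker": ["server1:1234"]}, "task": {"type": "ps", "index": 1}}    --schedule=run_std_server
```

Since there is a single worker in this example, the worker line carries `--sync`, matching the conditional in `main`.
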

tensor2tensor/bin/t2t-trainer

Lines changed: 0 additions & 1 deletion
```diff
@@ -42,7 +42,6 @@ def main(_):
   tf.logging.set_verbosity(tf.logging.INFO)
   utils.log_registry()
   utils.validate_flags()
-  # TODO(rsepassi): Document distributed training
   utils.run(
       data_dir=FLAGS.data_dir,
       model=FLAGS.model,
```

tensor2tensor/docs/distributed_training.md

Lines changed: 68 additions & 0 deletions
````markdown
# Distributed Training

The `t2t-trainer` supports both synchronous and asynchronous distributed
training.

T2T uses TensorFlow Estimators and so distributed training is configured with
the `TF_CONFIG` environment variable that is read by the
[RunConfig](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/learn/python/learn/estimators/run_config.py)
along with a set of flags.

## `TF_CONFIG`

Both workers and parameter servers must have the `TF_CONFIG` environment
variable set.

The `TF_CONFIG` environment variable is a json-encoded string with the addresses
of the workers and parameter servers (in the `'cluster'` key) and the
identification of the current task (in the `'task'` key).

For example:

```
cluster = {
    'ps': ['host1:2222', 'host2:2222'],
    'worker': ['host3:2222', 'host4:2222', 'host5:2222']
}
os.environ['TF_CONFIG'] = json.dumps({
    'cluster': cluster,
    'task': {'type': 'worker', 'index': 1}
})
```

## Command-line flags

The following T2T command-line flags must also be set on the workers for
distributed training:

- `--master=$ADDRESS`
- `--worker_replicas=$NUM_WORKERS`
- `--worker_gpu=$NUM_GPUS_PER_WORKER`
- `--worker_id=$WORKER_ID`
- `--ps_replicas=$NUM_PS`
- `--ps_gpu=$NUM_GPUS_PER_PS`
- `--schedule=train`
- `--sync`, if you want synchronous training, i.e. for there to be a single
  master worker coordinating the work across "ps" jobs (yes, the naming is
  unfortunate). If not set, then each worker operates independently while
  variables are shared on the parameter servers.

Parameter servers only need `--schedule=run_std_server`.
````
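
To tie the `TF_CONFIG` and flag sections together, a single-worker launch might look like the sketch below. Host names and values are illustrative and not part of the committed doc; the quickstart variables (`$DATA_DIR`, `$PROBLEM`, `$MODEL`, `$HPARAMS`, `$TRAIN_DIR`) are assumed from the README, and with one worker the `--sync` flag applies, matching `make_tf_configs.py`:

```
# Worker 0 (e.g. on host3); extra flags may be required by flag validation.
export TF_CONFIG='{"cluster": {"ps": ["host1:2222"], "worker": ["host3:2222"]}, "task": {"type": "worker", "index": 0}}'
t2t-trainer \
  --data_dir=$DATA_DIR \
  --problems=$PROBLEM \
  --model=$MODEL \
  --hparams_set=$HPARAMS \
  --output_dir=$TRAIN_DIR \
  --master=host3:2222 \
  --worker_replicas=1 \
  --worker_gpu=1 \
  --worker_id=0 \
  --ps_replicas=1 \
  --ps_gpu=1 \
  --schedule=train \
  --sync

# Parameter server (e.g. on host1)
export TF_CONFIG='{"cluster": {"ps": ["host1:2222"], "worker": ["host3:2222"]}, "task": {"type": "ps", "index": 0}}'
t2t-trainer --schedule=run_std_server
```

The committed doc continues below.
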
````markdown
## Utility to produce `TF_CONFIG` and flags

[`bin/make_tf_configs.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/bin/make_tf_configs.py)
generates the `TF_CONFIG` json strings and the above-mentioned command-line
flags for the workers and parameter servers.

## Command-line flags for eval jobs

Eval jobs should set the following flags and do not need the `TF_CONFIG`
environment variable to be set as the eval jobs run locally and do not
communicate to the other jobs (the eval jobs read the model checkpoints that the
trainer writes out):

- `--schedule=continuous_eval_on_train_data` or
  `--schedule=continuous_eval` (for test data)
- `--worker_job='/job:localhost'`
- `--output_dir=$TRAIN_DIR`
````
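
Similarly, a sketch of an eval job per the flag list above, run locally next to the training output (data and model flags mirror the training job; this is an illustration, not a command from the doc):

```
t2t-trainer \
  --data_dir=$DATA_DIR \
  --problems=$PROBLEM \
  --model=$MODEL \
  --hparams_set=$HPARAMS \
  --output_dir=$TRAIN_DIR \
  --schedule=continuous_eval \
  --worker_job='/job:localhost'
```
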

tensor2tensor/utils/trainer_utils.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -126,7 +126,7 @@ def experiment_fn(output_dir):
 
 def create_experiment(output_dir, data_dir, model_name, train_steps,
                       eval_steps):
-  hparams = create_hparams(FLAGS.hparams_set, FLAGS.data_dir)
+  hparams = create_hparams(FLAGS.hparams_set, data_dir)
   estimator, input_fns = create_experiment_components(
       hparams=hparams,
       output_dir=output_dir,
```
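
(This one-line fix makes `create_experiment` honor its `data_dir` argument instead of reaching for the global `FLAGS.data_dir`; the new `testSingleStep` test below relies on it when pointing the experiment at a temporary data directory.)
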

tensor2tensor/utils/trainer_utils_test.py

Lines changed: 29 additions & 0 deletions
```diff
@@ -20,14 +20,29 @@
 
 # Dependency imports
 
+from tensor2tensor.data_generators import algorithmic
+from tensor2tensor.data_generators import generator_utils
 from tensor2tensor.utils import registry
 from tensor2tensor.utils import trainer_utils as utils  # pylint: disable=unused-import
 
 import tensorflow as tf
 
+FLAGS = tf.flags.FLAGS
+
 
 class TrainerUtilsTest(tf.test.TestCase):
 
+  @classmethod
+  def setUpClass(cls):
+    # Generate a small test dataset
+    FLAGS.problems = "algorithmic_addition_binary40"
+    TrainerUtilsTest.data_dir = tf.test.get_temp_dir()
+    gen = algorithmic.identity_generator(2, 10, 300)
+    generator_utils.generate_files(gen, FLAGS.problems + "-train",
+                                   TrainerUtilsTest.data_dir, 1, 100)
+    generator_utils.generate_files(gen, FLAGS.problems + "-dev",
+                                   TrainerUtilsTest.data_dir, 1, 100)
+
   def testModelsImported(self):
     models = registry.list_models()
     self.assertTrue("baseline_lstm_seq2seq" in models)
@@ -36,6 +51,20 @@ def testHParamsImported(self):
     hparams = registry.list_hparams()
     self.assertTrue("transformer_base" in hparams)
 
+  def testSingleStep(self):
+    model_name = "transformer"
+    FLAGS.hparams_set = "transformer_base"
+    # Shrink the test model down
+    FLAGS.hparams = ("batch_size=10,hidden_size=10,num_heads=2,max_length=16,"
+                     "num_hidden_layers=1")
+    exp = utils.create_experiment(
+        output_dir=tf.test.get_temp_dir(),
+        data_dir=TrainerUtilsTest.data_dir,
+        model_name=model_name,
+        train_steps=1,
+        eval_steps=1)
+    exp.test()
+
 
 if __name__ == "__main__":
   tf.test.main()
```
