⚡ Add gradient accumulation for CTC models

nglehuy · nglehuy · commit 74ec27179bf0 · 2020-11-20T20:48:45.000+07:00
diff --git a/examples/deepspeech2/config.yml b/examples/deepspeech2/config.yml
@@ -67,8 +67,9 @@ learning_config:
       learning_rate: 0.0001
 
   running_config:
-    batch_size: 8
+    batch_size: 4
     num_epochs: 20
+    accumulation_steps: 8
     outdir: /mnt/d/SpeechProcessing/Trained/local/deepspeech2
     log_interval_steps: 400
     save_interval_steps: 400
diff --git a/examples/deepspeech2/train_ga_ds2.py b/examples/deepspeech2/train_ga_ds2.py
@@ -0,0 +1,114 @@
+# Copyright 2020 Huy Le Nguyen (@usimarit)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import argparse
+from tensorflow_asr.utils import setup_environment, setup_strategy
+
+setup_environment()
+import tensorflow as tf
+
+DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
+
+tf.keras.backend.clear_session()
+
+parser = argparse.ArgumentParser(prog="Deep Speech 2 Training")
+
+parser.add_argument("--config", "-c", type=str, default=DEFAULT_YAML,
+                    help="The file path of model configuration file")
+
+parser.add_argument("--max_ckpts", type=int, default=10,
+                    help="Max number of checkpoints to keep")
+
+parser.add_argument("--tbs", type=int, default=None,
+                    help="Train batch size per replicas")
+
+parser.add_argument("--ebs", type=int, default=None,
+                    help="Evaluation batch size per replicas")
+
+parser.add_argument("--acs", type=int, default=None,
+                    help="Train accumulation steps")
+
+parser.add_argument("--tfrecords", default=False, action="store_true",
+                    help="Whether to use tfrecords dataset")
+
+parser.add_argument("--devices", type=int, nargs="*", default=[0],
+                    help="Devices' ids to apply distributed training")
+
+parser.add_argument("--mxp", default=False, action="store_true",
+                    help="Enable mixed precision")
+
+parser.add_argument("--cache", default=False, action="store_true",
+                    help="Enable caching for dataset")
+
+args = parser.parse_args()
+
+tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
+
+strategy = setup_strategy(args.devices)
+
+from tensorflow_asr.configs.config import Config
+from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
+from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
+from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
+from tensorflow_asr.runners.ctc_runners import CTCTrainerGA
+from tensorflow_asr.models.deepspeech2 import DeepSpeech2
+
+config = Config(args.config, learning=True)
+speech_featurizer = TFSpeechFeaturizer(config.speech_config)
+text_featurizer = CharFeaturizer(config.decoder_config)
+
+if args.tfrecords:
+    train_dataset = ASRTFRecordDataset(
+        data_paths=config.learning_config.dataset_config.train_paths,
+        tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
+        augmentations=config.learning_config.augmentations,
+        stage="train", cache=args.cache, shuffle=True
+    )
+    eval_dataset = ASRTFRecordDataset(
+        data_paths=config.learning_config.dataset_config.eval_paths,
+        tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
+        stage="eval", cache=args.cache, shuffle=True
+    )
+else:
+    train_dataset = ASRSliceDataset(
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
+        data_paths=config.learning_config.dataset_config.train_paths,
+        augmentations=config.learning_config.augmentations,
+        stage="train", cache=args.cache, shuffle=True
+    )
+    eval_dataset = ASRSliceDataset(
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
+        data_paths=config.learning_config.dataset_config.eval_paths,
+        stage="eval", cache=args.cache, shuffle=True
+    )
+
+ctc_trainer = CTCTrainerGA(text_featurizer, config.learning_config.running_config)
+# Build DS2 model
+with ctc_trainer.strategy.scope():
+    ds2_model = DeepSpeech2(**config.model_config, vocabulary_size=text_featurizer.num_classes)
+    ds2_model._build(speech_featurizer.shape)
+    ds2_model.summary(line_length=120)
+# Compile
+ctc_trainer.compile(ds2_model, config.learning_config.optimizer_config,
+                    max_to_keep=args.max_ckpts)
+
+ctc_trainer.fit(train_dataset, eval_dataset,
+                train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs)
diff --git a/examples/jasper/config.yml b/examples/jasper/config.yml
@@ -74,8 +74,9 @@ learning_config:
       learning_rate: 0.0001
 
   running_config:
-    batch_size: 8
+    batch_size: 4
     num_epochs: 20
+    accumulation_steps: 8
     outdir: /mnt/d/SpeechProcessing/Trained/local/jasper
     log_interval_steps: 400
     save_interval_steps: 400
diff --git a/examples/jasper/train_ga_jasper.py b/examples/jasper/train_ga_jasper.py
@@ -0,0 +1,114 @@
+# Copyright 2020 Huy Le Nguyen (@usimarit)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import argparse
+from tensorflow_asr.utils import setup_environment, setup_strategy
+
+setup_environment()
+import tensorflow as tf
+
+DEFAULT_YAML = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.yml")
+
+tf.keras.backend.clear_session()
+
+parser = argparse.ArgumentParser(prog="Jasper Training")
+
+parser.add_argument("--config", "-c", type=str, default=DEFAULT_YAML,
+                    help="The file path of model configuration file")
+
+parser.add_argument("--max_ckpts", type=int, default=10,
+                    help="Max number of checkpoints to keep")
+
+parser.add_argument("--tbs", type=int, default=None,
+                    help="Train batch size per replicas")
+
+parser.add_argument("--ebs", type=int, default=None,
+                    help="Evaluation batch size per replicas")
+
+parser.add_argument("--acs", type=int, default=None,
+                    help="Train accumulation steps")
+
+parser.add_argument("--tfrecords", default=False, action="store_true",
+                    help="Whether to use tfrecords dataset")
+
+parser.add_argument("--devices", type=int, nargs="*", default=[0],
+                    help="Devices' ids to apply distributed training")
+
+parser.add_argument("--mxp", default=False, action="store_true",
+                    help="Enable mixed precision")
+
+parser.add_argument("--cache", default=False, action="store_true",
+                    help="Enable caching for dataset")
+
+args = parser.parse_args()
+
+tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
+
+strategy = setup_strategy(args.devices)
+
+from tensorflow_asr.configs.config import Config
+from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
+from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
+from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
+from tensorflow_asr.runners.ctc_runners import CTCTrainerGA
+from tensorflow_asr.models.jasper import Jasper
+
+config = Config(args.config, learning=True)
+speech_featurizer = TFSpeechFeaturizer(config.speech_config)
+text_featurizer = CharFeaturizer(config.decoder_config)
+
+if args.tfrecords:
+    train_dataset = ASRTFRecordDataset(
+        data_paths=config.learning_config.dataset_config.train_paths,
+        tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
+        augmentations=config.learning_config.augmentations,
+        stage="train", cache=args.cache, shuffle=True
+    )
+    eval_dataset = ASRTFRecordDataset(
+        data_paths=config.learning_config.dataset_config.eval_paths,
+        tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
+        stage="eval", cache=args.cache, shuffle=True
+    )
+else:
+    train_dataset = ASRSliceDataset(
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
+        data_paths=config.learning_config.dataset_config.train_paths,
+        augmentations=config.learning_config.augmentations,
+        stage="train", cache=args.cache, shuffle=True
+    )
+    eval_dataset = ASRSliceDataset(
+        speech_featurizer=speech_featurizer,
+        text_featurizer=text_featurizer,
+        data_paths=config.learning_config.dataset_config.eval_paths,
+        stage="eval", cache=args.cache, shuffle=True
+    )
+
+ctc_trainer = CTCTrainerGA(text_featurizer, config.learning_config.running_config)
+# Build Jasper model
+with ctc_trainer.strategy.scope():
+    jasper = Jasper(**config.model_config, vocabulary_size=text_featurizer.num_classes)
+    jasper._build(speech_featurizer.shape)
+    jasper.summary(line_length=120)
+# Compile
+ctc_trainer.compile(jasper, config.learning_config.optimizer_config,
+                    max_to_keep=args.max_ckpts)
+
+ctc_trainer.fit(train_dataset, eval_dataset,
+                train_bs=args.tbs, eval_bs=args.ebs, train_acs=args.acs)
diff --git a/tensorflow_asr/optimizers/accumulation.py b/tensorflow_asr/optimizers/accumulation.py
@@ -21,15 +21,16 @@ def __init__(self, trainable_variables):
             tf.Variable(
                 tf.zeros_like(g),
                 trainable=False,
-                synchronization=tf.VariableSynchronization.ON_READ
+                synchronization=tf.VariableSynchronization.ON_READ,
+                aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA
             ) for g in trainable_variables
         ]
 
     def reset(self):
         for i, g in enumerate(self.gradients):
-            self.gradients[i].assign(tf.zeros_like(g))
+            self.gradients[i].assign(tf.zeros_like(g), read_value=False)
 
     def accumulate(self, step_gradients):
         for i, g in enumerate(step_gradients):
             if g is None: continue
-            self.gradients[i].assign_add(g)
+            self.gradients[i].assign_add(g, read_value=False)
diff --git a/tensorflow_asr/runners/ctc_runners.py b/tensorflow_asr/runners/ctc_runners.py
@@ -19,6 +19,7 @@
 from ..featurizers.text_featurizers import TextFeaturizer
 from ..losses.ctc_losses import ctc_loss
 from .base_runners import BaseTrainer
+from ..optimizers.accumulation import GradientAccumulation
 
 
 class CTCTrainer(BaseTrainer):
@@ -89,3 +90,49 @@ def compile(self, model: tf.keras.Model,
             self.model = model
             self.optimizer = tf.keras.optimizers.get(optimizer)
         self.create_checkpoint_manager(max_to_keep, model=self.model, optimizer=self.optimizer)
+
+
+class CTCTrainerGA(CTCTrainer):
+    """ Trainer for CTC Models with gradient accumulation (one optimizer update per `accumulation_steps` batches) """
+
+    @tf.function
+    def _train_function(self, iterator):
+        for _ in range(self.config.accumulation_steps):
+            batch = next(iterator)
+            self.strategy.run(self._train_step, args=(batch,))
+        self.strategy.run(self._apply_gradients, args=())
+
+    @tf.function
+    def _apply_gradients(self):
+        self.optimizer.apply_gradients(
+            zip(self.accumulation.gradients, self.model.trainable_variables))
+        self.accumulation.reset()
+
+    @tf.function(experimental_relax_shapes=True)
+    def _train_step(self, batch):
+        _, features, input_length, labels, label_length, _ = batch
+
+        with tf.GradientTape() as tape:
+            y_pred = self.model(features, training=True)
+            tape.watch(y_pred)
+            per_train_loss = ctc_loss(
+                y_true=labels, y_pred=y_pred,
+                input_length=(input_length // self.model.time_reduction_factor),
+                label_length=label_length,
+                blank=self.text_featurizer.blank
+            )
+            train_loss = tf.nn.compute_average_loss(per_train_loss,
+                                                    global_batch_size=self.global_batch_size)
+
+        gradients = tape.gradient(train_loss, self.model.trainable_variables)
+        self.accumulation.accumulate(gradients)
+        self.train_metrics["ctc_loss"].update_state(per_train_loss)
+
+    def compile(self, model: tf.keras.Model,
+                optimizer: any,
+                max_to_keep: int = 10):
+        with self.strategy.scope():
+            self.model = model
+            self.optimizer = tf.keras.optimizers.get(optimizer)
+        self.create_checkpoint_manager(max_to_keep, model=self.model, optimizer=self.optimizer)
+        self.accumulation = GradientAccumulation(self.model.trainable_variables)