TensorSpeech
diff --git a/examples/conformer/config.yml b/examples/conformer/config.yml
Lines changed: 9 additions & 7 deletions
diff --git a/examples/conformer/train_tpu_keras_subword_conformer.py b/examples/conformer/train_tpu_keras_subword_conformer.py
Lines changed: 19 additions & 10 deletions
diff --git a/examples/contextnet/train_tpu_keras_subword_contextnet.py b/examples/contextnet/train_tpu_keras_subword_contextnet.py
Lines changed: 3 additions & 3 deletions
diff --git a/setup.py b/setup.py
Lines changed: 1 addition & 1 deletion
diff --git a/tensorflow_asr/augmentations/augments.py b/tensorflow_asr/augmentations/augments.py
Lines changed: 12 additions & 12 deletions
diff --git a/tensorflow_asr/datasets/base_dataset.py b/tensorflow_asr/datasets/base_dataset.py
Lines changed: 5 additions & 2 deletions
diff --git a/tensorflow_asr/featurizers/text_featurizers.py b/tensorflow_asr/featurizers/text_featurizers.py
Lines changed: 4 additions & 2 deletions
diff --git a/tensorflow_asr/models/conformer.py b/tensorflow_asr/models/conformer.py
Lines changed: 21 additions & 4 deletions
diff --git a/tensorflow_asr/models/contextnet.py b/tensorflow_asr/models/contextnet.py
Lines changed: 12 additions & 1 deletion
@@ -25,8 +25,8 @@ speech_config:
 
 decoder_config:
   vocabulary: null
-  target_vocab_size: 4096
-  max_subword_length: 4
+  target_vocab_size: 1000
+  max_subword_length: 10
   blank_at_zero: True
   beam_width: 5
   norm_score: True
@@ -40,7 +40,7 @@ model_config:
     filters: 144
     kernel_size: 3
     strides: 2
-  encoder_positional_encoding: sinusoid_concat
+  encoder_positional_encoding: sinusoid_concat_v2
   encoder_dmodel: 144
   encoder_num_blocks: 16
   encoder_head_size: 36
@@ -55,11 +55,12 @@ model_config:
   prediction_rnn_units: 320
   prediction_rnn_type: lstm
   prediction_rnn_implementation: 2
-  prediction_layer_norm: False
-  prediction_projection_units: 144
-  joint_dim: 640
-  prejoint_linear: False
+  prediction_layer_norm: True
+  prediction_projection_units: 0
+  joint_dim: 320
+  prejoint_linear: True
   joint_activation: tanh
+  joint_mode: add
 
 learning_config:
   train_dataset_config:
@@ -78,6 +79,7 @@ learning_config:
     tfrecords_dir: /mnt/Miscellanea/Datasets/Speech/LibriSpeech/tfrecords
     shuffle: True
     cache: True
+    cache_percent: 0.2
     buffer_size: 100
     drop_remainder: True
     stage: train
 
@@ -48,6 +48,8 @@
 
 parser.add_argument("--saved", type=str, default=None, help="Path to saved model")
 
+parser.add_argument("--validation", default=False, action="store_true", help="Enable validation dataset")
+
 args = parser.parse_args()
 
 tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})
@@ -83,35 +85,42 @@
     **vars(config.learning_config.train_dataset_config),
     indefinite=True
 )
-eval_dataset = ASRTFRecordDatasetKeras(
-    speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
-    **vars(config.learning_config.eval_dataset_config),
-    indefinite=True
-)
+
+if args.validation:
+    eval_dataset = ASRTFRecordDatasetKeras(
+        speech_featurizer=speech_featurizer, text_featurizer=text_featurizer,
+        **vars(config.learning_config.eval_dataset_config),
+        indefinite=True
+    )
 
 if args.compute_lengths:
     train_dataset.update_lengths(args.metadata_prefix)
-    eval_dataset.update_lengths(args.metadata_prefix)
+    if args.validation:
+        eval_dataset.update_lengths(args.metadata_prefix)
 
 # Update metadata calculated from both train and eval datasets
 train_dataset.load_metadata(args.metadata_prefix)
-eval_dataset.load_metadata(args.metadata_prefix)
+if args.validation:
+    eval_dataset.load_metadata(args.metadata_prefix)
 
 batch_size = args.bs if args.bs is not None else config.learning_config.running_config.batch_size
 global_batch_size = batch_size
 global_batch_size *= strategy.num_replicas_in_sync
 
 train_data_loader = train_dataset.create(global_batch_size)
-eval_data_loader = eval_dataset.create(global_batch_size)
+eval_data_loader = eval_dataset.create(global_batch_size) if args.validation else None
+validation_steps = eval_dataset.total_steps if args.validation else None
 
 with strategy.scope():
     # build model
     conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)
     conformer._build(speech_featurizer.shape, prediction_shape=text_featurizer.prepand_shape, batch_size=global_batch_size)
-    conformer.summary(line_length=120)
 
     if args.saved:
         conformer.load_weights(args.saved, by_name=True, skip_mismatch=True)
+        print('Load pretrained weights successfully')
+
+    conformer.summary(line_length=120)
 
     optimizer = tf.keras.optimizers.Adam(
         TransformerSchedule(
@@ -140,5 +149,5 @@
 conformer.fit(
     train_data_loader, epochs=config.learning_config.running_config.num_epochs,
     validation_data=eval_data_loader, callbacks=callbacks,
-    steps_per_epoch=train_dataset.total_steps, validation_steps=eval_dataset.total_steps
+    steps_per_epoch=train_dataset.total_steps, validation_steps=validation_steps
 )
@@ -57,7 +57,7 @@
 from tensorflow_asr.configs.config import Config
 from tensorflow_asr.datasets.keras import ASRTFRecordDatasetKeras
 from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer
+from tensorflow_asr.featurizers.text_featurizers import TFSubwordFeaturizer, SentencePieceFeaturizer
 from tensorflow_asr.models.keras.contextnet import ContextNet
 from tensorflow_asr.optimizers.schedules import TransformerSchedule
 
@@ -69,10 +69,10 @@
     text_featurizer = SentencePieceFeaturizer.load_from_file(config.decoder_config, args.subwords)
 elif args.subwords and os.path.exists(args.subwords):
     print("Loading subwords ...")
-    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
+    text_featurizer = TFSubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
 else:
     print("Generating subwords ...")
-    text_featurizer = SubwordFeaturizer.build_from_corpus(
+    text_featurizer = TFSubwordFeaturizer.build_from_corpus(
         config.decoder_config,
         corpus_files=args.subwords_corpus
     )
 
@@ -22,7 +22,7 @@
 
 setuptools.setup(
     name="TensorFlowASR",
-    version="0.8.1",
+    version="0.8.2",
     author="Huy Le Nguyen",
     author_email="[email protected]",
     description="Almost State-of-the-art Automatic Speech Recognition using Tensorflow 2",
 
@@ -40,29 +40,29 @@
 
 
 class TFAugmentationExecutor:
-    def __init__(self, augmentations: list):
+    def __init__(self, augmentations: list, prob: float = 0.5):
         self.augmentations = augmentations
+        self.prob = prob
 
     @tf.function
     def augment(self, inputs):
         outputs = inputs
         for au in self.augmentations:
-            outputs = au.augment(outputs)
+            p = tf.random.uniform([])
+            outputs = tf.where(tf.less(p, self.prob), au.augment(outputs), outputs)
         return outputs
 
 
 class Augmentation:
     def __init__(self, config: dict = None, use_tf: bool = False):
         if not config: config = {}
-        if use_tf:
-            self.before = self.tf_parse(config.pop("before", {}))
-            self.after = self.tf_parse(config.pop("after", {}))
-        else:
-            self.before = self.parse(config.pop("before", {}))
-            self.after = self.parse(config.pop("after", {}))
+        prob = float(config.pop("prob", 0.5))
+        parser = self.tf_parse if use_tf else self.parse
+        self.before = parser(config.pop("before", {}), prob=prob)
+        self.after = parser(config.pop("after", {}), prob=prob)
 
     @staticmethod
-    def parse(config: dict) -> list:
+    def parse(config: dict, prob: float = 0.5) -> naf.Sometimes:
         augmentations = []
         for key, value in config.items():
             au = AUGMENTATIONS.get(key, None)
@@ -71,10 +71,10 @@ def parse(config: dict) -> list:
                                f"Available augmentations: {AUGMENTATIONS.keys()}")
             aug = au(**value) if value is not None else au()
             augmentations.append(aug)
-        return naf.Sometimes(augmentations)
+        return naf.Sometimes(augmentations, pipeline_p=prob)
 
     @staticmethod
-    def tf_parse(config: dict) -> list:
+    def tf_parse(config: dict, prob: float = 0.5) -> TFAugmentationExecutor:
         augmentations = []
         for key, value in config.items():
             au = TFAUGMENTATIONS.get(key, None)
@@ -83,4 +83,4 @@ def tf_parse(config: dict) -> list:
                                f"Available tf augmentations: {TFAUGMENTATIONS.keys()}")
             aug = au(**value) if value is not None else au()
             augmentations.append(aug)
-        return TFAugmentationExecutor(augmentations)
+        return TFAugmentationExecutor(augmentations, prob=prob)
@@ -37,10 +37,13 @@ def __init__(self,
                  stage: str = "train",
                  **kwargs):
         self.data_paths = data_paths or []
+        if not isinstance(self.data_paths, list):
+            raise ValueError('data_paths must be a list of string paths')
         self.augmentations = augmentations  # apply augmentation
-        self.cache = cache  # whether to cache WHOLE transformed dataset to memory
+        self.cache = cache  # whether to cache transformed dataset to memory
         self.shuffle = shuffle  # whether to shuffle tf.data.Dataset
-        if buffer_size <= 0 and shuffle: raise ValueError("buffer_size must be positive when shuffle is on")
+        if buffer_size <= 0 and shuffle:
+            raise ValueError("buffer_size must be positive when shuffle is on")
         self.buffer_size = buffer_size  # shuffle buffer size
         self.stage = stage  # for defining tfrecords files
         self.use_tf = use_tf
 
@@ -295,8 +295,10 @@ def iextract(self, indices: tf.Tensor) -> tf.Tensor:
             def cond(batch, total, _): return tf.less(batch, total)
 
             def body(batch, total, transcripts):
-                upoints = self.indices2upoints(indices[batch])
-                transcripts = transcripts.write(batch, tf.strings.unicode_encode(upoints, "UTF-8"))
+                norm_indices = self.normalize_indices(indices[batch])
+                norm_indices = tf.gather_nd(norm_indices, tf.where(tf.not_equal(norm_indices, 0)))
+                decoded = tf.numpy_function(self.subwords.decode, inp=[norm_indices], Tout=tf.string)
+                transcripts = transcripts.write(batch, decoded)
                 return batch + 1, total, transcripts
 
             _, _, transcripts = tf.while_loop(cond, body, loop_vars=[batch, total, transcripts])
 
@@ -307,12 +307,17 @@ def __init__(self,
 
         if positional_encoding == "sinusoid":
             self.pe = PositionalEncoding(name=f"{name}_pe")
+        elif positional_encoding == "sinusoid_v2":
+            self.pe = PositionalEncoding(alpha=2, beta=0, name=f"{name}_pe")
         elif positional_encoding == "sinusoid_concat":
             self.pe = PositionalEncodingConcat(name=f"{name}_pe")
+        elif positional_encoding == "sinusoid_concat_v2":
+            self.pe = PositionalEncodingConcat(alpha=2, beta=-1, name=f"{name}_pe")
         elif positional_encoding == "subsampling":
             self.pe = tf.keras.layers.Activation("linear", name=f"{name}_pe")
         else:
-            raise ValueError("positional_encoding must be either 'sinusoid' or 'subsampling'")
+            raise ValueError("positional_encoding must be either 'sinusoid', "
+                             "'sinusoid_concat', 'sinusoid_v2', 'sinusoid_concat_v2' or 'subsampling'")
 
         self.linear = tf.keras.layers.Dense(
             dmodel, name=f"{name}_linear",
@@ -373,6 +378,7 @@ def __init__(self,
                  encoder_depth_multiplier: int = 1,
                  encoder_fc_factor: float = 0.5,
                  encoder_dropout: float = 0,
+                 encoder_trainable: bool = True,
                  prediction_embed_dim: int = 512,
                  prediction_embed_dropout: int = 0,
                  prediction_num_rnns: int = 1,
@@ -381,12 +387,16 @@ def __init__(self,
                  prediction_rnn_implementation: int = 2,
                  prediction_layer_norm: bool = True,
                  prediction_projection_units: int = 0,
+                 prediction_trainable: bool = True,
                  joint_dim: int = 1024,
                  joint_activation: str = "tanh",
                  prejoint_linear: bool = True,
+                 postjoint_linear: bool = False,
+                 joint_mode: str = "add",
+                 joint_trainable: bool = True,
                  kernel_regularizer=L2,
                  bias_regularizer=L2,
-                 name: str = "conformer_transducer",
+                 name: str = "conformer",
                  **kwargs):
         super(Conformer, self).__init__(
             encoder=ConformerEncoder(
@@ -402,7 +412,9 @@ def __init__(self,
                 fc_factor=encoder_fc_factor,
                 dropout=encoder_dropout,
                 kernel_regularizer=kernel_regularizer,
-                bias_regularizer=bias_regularizer
+                bias_regularizer=bias_regularizer,
+                trainable=encoder_trainable,
+                name=f"{name}_encoder"
             ),
             vocabulary_size=vocabulary_size,
             embed_dim=prediction_embed_dim,
@@ -413,12 +425,17 @@ def __init__(self,
             rnn_implementation=prediction_rnn_implementation,
             layer_norm=prediction_layer_norm,
             projection_units=prediction_projection_units,
+            prediction_trainable=prediction_trainable,
             joint_dim=joint_dim,
             joint_activation=joint_activation,
             prejoint_linear=prejoint_linear,
+            postjoint_linear=postjoint_linear,
+            joint_mode=joint_mode,
+            joint_trainable=joint_trainable,
             kernel_regularizer=kernel_regularizer,
             bias_regularizer=bias_regularizer,
-            name=name, **kwargs
+            name=name,
+            **kwargs
         )
         self.dmodel = encoder_dmodel
         self.time_reduction_factor = self.encoder.conv_subsampling.time_reduction_factor
@@ -197,6 +197,7 @@ def __init__(self,
                  vocabulary_size: int,
                  encoder_blocks: List[dict],
                  encoder_alpha: float = 0.5,
+                 encoder_trainable: bool = True,
                  prediction_embed_dim: int = 512,
                  prediction_embed_dropout: int = 0,
                  prediction_num_rnns: int = 1,
@@ -205,9 +206,13 @@ def __init__(self,
                  prediction_rnn_implementation: int = 2,
                  prediction_layer_norm: bool = True,
                  prediction_projection_units: int = 0,
+                 prediction_trainable: bool = True,
                  joint_dim: int = 1024,
                  joint_activation: str = "tanh",
                  prejoint_linear: bool = True,
+                 postjoint_linear: bool = False,
+                 joint_mode: str = "add",
+                 joint_trainable: bool = True,
                  kernel_regularizer=L2,
                  bias_regularizer=L2,
                  name: str = "contextnet",
@@ -218,6 +223,7 @@ def __init__(self,
                 alpha=encoder_alpha,
                 kernel_regularizer=kernel_regularizer,
                 bias_regularizer=bias_regularizer,
+                trainable=encoder_trainable,
                 name=f"{name}_encoder"
             ),
             vocabulary_size=vocabulary_size,
@@ -228,13 +234,18 @@ def __init__(self,
             rnn_type=prediction_rnn_type,
             rnn_implementation=prediction_rnn_implementation,
             layer_norm=prediction_layer_norm,
+            prediction_trainable=prediction_trainable,
             projection_units=prediction_projection_units,
             joint_dim=joint_dim,
             joint_activation=joint_activation,
             prejoint_linear=prejoint_linear,
+            postjoint_linear=postjoint_linear,
+            joint_mode=joint_mode,
+            joint_trainable=joint_trainable,
             kernel_regularizer=kernel_regularizer,
             bias_regularizer=bias_regularizer,
-            name=name, **kwargs
+            name=name,
+            **kwargs
         )
         self.dmodel = self.encoder.blocks[-1].dmodel
         self.time_reduction_factor = 1