✍️ fix subword featurizer iextract

nglehuy · nglehuy · commit bc825a3afa6a · 2021-04-02T00:30:54.000+07:00
diff --git a/tensorflow_asr/datasets/base_dataset.py b/tensorflow_asr/datasets/base_dataset.py
@@ -37,10 +37,13 @@ def __init__(self,
                  stage: str = "train",
                  **kwargs):
         self.data_paths = data_paths or []
+        if not isinstance(self.data_paths, list):
+            raise ValueError('data_paths must be a list of string paths')
         self.augmentations = augmentations  # apply augmentation
         self.cache = cache  # whether to cache transformed dataset to memory
         self.shuffle = shuffle  # whether to shuffle tf.data.Dataset
-        if buffer_size <= 0 and shuffle: raise ValueError("buffer_size must be positive when shuffle is on")
+        if buffer_size <= 0 and shuffle:
+            raise ValueError("buffer_size must be positive when shuffle is on")
         self.buffer_size = buffer_size  # shuffle buffer size
         self.stage = stage  # for defining tfrecords files
         self.use_tf = use_tf
diff --git a/tensorflow_asr/featurizers/text_featurizers.py b/tensorflow_asr/featurizers/text_featurizers.py
@@ -295,8 +295,10 @@ def iextract(self, indices: tf.Tensor) -> tf.Tensor:
             def cond(batch, total, _): return tf.less(batch, total)
 
             def body(batch, total, transcripts):
-                upoints = self.indices2upoints(indices[batch])
-                transcripts = transcripts.write(batch, tf.strings.unicode_encode(upoints, "UTF-8"))
+                norm_indices = self.normalize_indices(indices[batch])
+                norm_indices = tf.gather_nd(norm_indices, tf.where(tf.not_equal(norm_indices, 0)))
+                decoded = tf.numpy_function(self.subwords.decode, inp=[norm_indices], Tout=tf.string)
+                transcripts = transcripts.write(batch, decoded)
                 return batch + 1, total, transcripts
 
             _, _, transcripts = tf.while_loop(cond, body, loop_vars=[batch, total, transcripts])
diff --git a/tensorflow_asr/models/transducer.py b/tensorflow_asr/models/transducer.py
@@ -456,14 +456,11 @@ def _perform_greedy_batch(self,
                               encoded: tf.Tensor,
                               encoded_length: tf.Tensor,
                               parallel_iterations: int = 10,
-                              swap_memory: bool = False,
-                              version: str = 'v1'):
+                              swap_memory: bool = False):
         with tf.name_scope(f"{self.name}_perform_greedy_batch"):
             total_batch = tf.shape(encoded)[0]
             batch = tf.constant(0, dtype=tf.int32)
 
-            greedy_fn = self._perform_greedy if version == 'v1' else self._perform_greedy_v2
-
             decoded = tf.TensorArray(
                 dtype=tf.int32, size=total_batch, dynamic_size=False,
                 clear_after_read=False, element_shape=tf.TensorShape([None])
@@ -472,7 +469,7 @@ def _perform_greedy_batch(self,
             def condition(batch, _): return tf.less(batch, total_batch)
 
             def body(batch, decoded):
-                hypothesis = greedy_fn(
+                hypothesis = self._perform_greedy(
                     encoded=encoded[batch],
                     encoded_length=encoded_length[batch],
                     predicted=tf.constant(self.text_featurizer.blank, dtype=tf.int32),