# limitations under the License.

import tensorflow as tf
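+# tensorflow_text must be imported so its TFLite op registrar (tft.tflite_registrar.SELECT_TFTEXT_OPS) is available below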
+import tensorflow_text as tft
+from tensorflow.lite.python import interpreter

from tensorflow_asr.utils import cli_util, data_util

logger = tf.get_logger()


def main(
-    file_path: str,
-    tflite_path: str,
-    previous_encoder_states_shape: list = None,
-    previous_decoder_states_shape: list = None,
-    blank_index: int = 0,
+    audio_file_path: str,
+    tflite: str,
+    sample_rate: int = 16000,
+    blank: int = 0,
):
-    tflitemodel = tf.lite.Interpreter(model_path=tflite_path)
-    signal = data_util.read_raw_audio(file_path)
+    wav = data_util.load_and_convert_to_wav(audio_file_path, sample_rate=sample_rate)
+    signal = data_util.read_raw_audio(wav)
    signal = tf.reshape(signal, [1, -1])
    signal_length = tf.reshape(tf.shape(signal)[1], [1])

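+    # Load via InterpreterWithCustomOps so the TF.Text ops inside the exported model can be resolved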
+    tflitemodel = interpreter.InterpreterWithCustomOps(model_path=tflite, custom_op_registerers=tft.tflite_registrar.SELECT_TFTEXT_OPS)
    input_details = tflitemodel.get_input_details()
    output_details = tflitemodel.get_output_details()
-    tflitemodel.resize_tensor_input(input_details[0]["index"], signal.shape)
+
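+    # Resize the signal input to this clip's length; strict=True only permits resizing dynamic (unknown) dimensions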
+    tflitemodel.resize_tensor_input(input_details[0]["index"], signal.shape, strict=True)
    tflitemodel.allocate_tensors()
    tflitemodel.set_tensor(input_details[0]["index"], signal)
    tflitemodel.set_tensor(input_details[1]["index"], signal_length)
-    tflitemodel.set_tensor(input_details[2]["index"], tf.constant(blank_index, dtype=tf.int32))
-    if previous_encoder_states_shape:
-        tflitemodel.set_tensor(input_details[4]["index"], tf.zeros(previous_encoder_states_shape, dtype=tf.float32))
-    if previous_decoder_states_shape:
-        tflitemodel.set_tensor(input_details[5]["index"], tf.zeros(previous_decoder_states_shape, dtype=tf.float32))
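+    # Initial values now come from the model's own input signatures: previous tokens filled with the blank index, states zeroed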
+    tflitemodel.set_tensor(input_details[2]["index"], tf.ones(input_details[2]["shape"], dtype=input_details[2]["dtype"]) * blank)
+    tflitemodel.set_tensor(input_details[3]["index"], tf.zeros(input_details[3]["shape"], dtype=input_details[3]["dtype"]))
+    tflitemodel.set_tensor(input_details[4]["index"], tf.zeros(input_details[4]["shape"], dtype=input_details[4]["dtype"]))
+
    tflitemodel.invoke()
-    hyp = tflitemodel.get_tensor(output_details[0]["index"])

-    transcript = "".join([chr(u) for u in hyp])
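+    # Outputs: 0 = transcript text, 1 = decoded token ids, 2 = next-prediction tokens; any trailing outputs carry next states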
+    transcript = tflitemodel.get_tensor(output_details[0]["index"])
+    tokens = tflitemodel.get_tensor(output_details[1]["index"])
+    next_tokens = tflitemodel.get_tensor(output_details[2]["index"])
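+    # Handle models exported with or without encoder/decoder states, which expose different numbers of outputs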
+    if len(output_details) > 4:
+        next_encoder_states = tflitemodel.get_tensor(output_details[3]["index"])
+        next_decoder_states = tflitemodel.get_tensor(output_details[4]["index"])
+    elif len(output_details) > 3:
+        next_encoder_states = None
+        next_decoder_states = tflitemodel.get_tensor(output_details[3]["index"])
+    else:
+        next_encoder_states = None
+        next_decoder_states = None
+
    logger.info(f"Transcript: {transcript}")
-    return transcript
+    logger.info(f"Tokens: {tokens}")
+    logger.info(f"Next tokens: {next_tokens}")
+    logger.info(f"Next encoder states: {None if next_encoder_states is None else next_encoder_states.shape}")
+    logger.info(f"Next decoder states: {None if next_decoder_states is None else next_decoder_states.shape}")


if __name__ == "__main__":
|