From 2fd98de56f8b73e0402fb8c71296c3fa629592ed Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Wed, 25 Aug 2021 13:08:23 +0200 Subject: [PATCH] Split train.py into separate modules Currently train.py is overloaded with many independent features. Understanding the code and what will be the result of a training call requires untangling the entire script. It's also an error prone UX. This is a first step at separating independent parts into their own scripts. --- .github/actions/numpy_vers/action.yml | 10 +- bin/run-ci-ldc93s1_new.sh | 3 +- bin/run-ci-ldc93s1_new_bytes.sh | 3 +- bin/run-ldc93s1.py | 4 +- setup.py | 1 + .../coqui_stt_training/deepspeech_model.py | 403 +++++++++++ training/coqui_stt_training/evaluate.py | 29 +- training/coqui_stt_training/export.py | 216 ++++++ training/coqui_stt_training/train.py | 672 +----------------- .../training_graph_inference.py | 87 +++ training/coqui_stt_training/util/config.py | 2 +- training/coqui_stt_training/util/io.py | 7 + 12 files changed, 780 insertions(+), 657 deletions(-) create mode 100644 training/coqui_stt_training/deepspeech_model.py mode change 100755 => 100644 training/coqui_stt_training/evaluate.py create mode 100644 training/coqui_stt_training/export.py create mode 100644 training/coqui_stt_training/training_graph_inference.py diff --git a/.github/actions/numpy_vers/action.yml b/.github/actions/numpy_vers/action.yml index d93dfff7..41c7ebdd 100644 --- a/.github/actions/numpy_vers/action.yml +++ b/.github/actions/numpy_vers/action.yml @@ -28,15 +28,15 @@ runs: case "${{ inputs.pyver }}" in 3.7*) NUMPY_BUILD_VERSION="==1.14.5" - NUMPY_DEP_VERSION=">=1.14.5" + NUMPY_DEP_VERSION=">=1.14.5,<=1.19.4" ;; 3.8*) NUMPY_BUILD_VERSION="==1.17.3" - NUMPY_DEP_VERSION=">=1.17.3" + NUMPY_DEP_VERSION=">=1.17.3,<=1.19.4" ;; 3.9*) NUMPY_BUILD_VERSION="==1.19.4" - NUMPY_DEP_VERSION=">=1.19.4" + NUMPY_DEP_VERSION=">=1.19.4,<=1.19.4" ;; esac ;; @@ -57,7 +57,7 @@ runs: ;; 3.9*) NUMPY_BUILD_VERSION="==1.19.4" - NUMPY_DEP_VERSION=">=1.19.4" + NUMPY_DEP_VERSION=">=1.19.4,<=1.19.4" ;; esac ;; @@ -82,7 +82,7 @@ runs: ;; 3.9*) NUMPY_BUILD_VERSION="==1.19.4" - NUMPY_DEP_VERSION=">=1.19.4" + NUMPY_DEP_VERSION=">=1.19.4,<=1.19.4" ;; esac ;; diff --git a/bin/run-ci-ldc93s1_new.sh b/bin/run-ci-ldc93s1_new.sh index a0261257..6c7ac939 100755 --- a/bin/run-ci-ldc93s1_new.sh +++ b/bin/run-ci-ldc93s1_new.sh @@ -27,4 +27,5 @@ python -u train.py --alphabet_config_path "data/alphabet.txt" \ --max_to_keep 1 --checkpoint_dir '/tmp/ckpt' \ --learning_rate 0.001 --dropout_rate 0.05 --export_dir '/tmp/train' \ --scorer_path 'data/smoke_test/pruned_lm.scorer' \ - --audio_sample_rate ${audio_sample_rate} + --audio_sample_rate ${audio_sample_rate} \ + --export_tflite false diff --git a/bin/run-ci-ldc93s1_new_bytes.sh b/bin/run-ci-ldc93s1_new_bytes.sh index 5dec1fed..d08c729e 100755 --- a/bin/run-ci-ldc93s1_new_bytes.sh +++ b/bin/run-ci-ldc93s1_new_bytes.sh @@ -27,4 +27,5 @@ python -u train.py --show_progressbar false --early_stop false \ --learning_rate 0.001 --dropout_rate 0.05 --export_dir '/tmp/train_bytes' \ --scorer_path 'data/smoke_test/pruned_lm.bytes.scorer' \ --audio_sample_rate ${audio_sample_rate} \ - --bytes_output_mode true + --bytes_output_mode true \ + --export_tflite false diff --git a/bin/run-ldc93s1.py b/bin/run-ldc93s1.py index 5a2746d8..b25cc998 100755 --- a/bin/run-ldc93s1.py +++ b/bin/run-ldc93s1.py @@ -2,7 +2,8 @@ import os from import_ldc93s1 import _download_and_preprocess_data as download_ldc from coqui_stt_training.util.config import 
initialize_globals_from_args -from coqui_stt_training.train import train, test +from coqui_stt_training.train import train +from coqui_stt_training.evaluate import test # only one GPU for only one training sample os.environ["CUDA_VISIBLE_DEVICES"] = "0" @@ -21,5 +22,4 @@ initialize_globals_from_args( ) train() - test() diff --git a/setup.py b/setup.py index 84b4364a..93cdf094 100644 --- a/setup.py +++ b/setup.py @@ -18,6 +18,7 @@ def main(): "coqpit", "numpy", "optuna", + "numba <= 0.53.1", "opuslib == 2.0.0", "pandas", "progressbar2", diff --git a/training/coqui_stt_training/deepspeech_model.py b/training/coqui_stt_training/deepspeech_model.py new file mode 100644 index 00000000..c0579f63 --- /dev/null +++ b/training/coqui_stt_training/deepspeech_model.py @@ -0,0 +1,403 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import os +import sys + +LOG_LEVEL_INDEX = sys.argv.index("--log_level") + 1 if "--log_level" in sys.argv else 0 +DESIRED_LOG_LEVEL = ( + sys.argv[LOG_LEVEL_INDEX] if 0 < LOG_LEVEL_INDEX < len(sys.argv) else "3" +) +os.environ["TF_CPP_MIN_LOG_LEVEL"] = DESIRED_LOG_LEVEL + +import numpy as np +import tensorflow as tf +import tensorflow.compat.v1 as tfv1 + +tfv1.logging.set_verbosity( + { + "0": tfv1.logging.DEBUG, + "1": tfv1.logging.INFO, + "2": tfv1.logging.WARN, + "3": tfv1.logging.ERROR, + }.get(DESIRED_LOG_LEVEL) +) + +from .util.config import Config +from .util.feeding import audio_to_features + + +def variable_on_cpu(name, shape, initializer): + r""" + Next we concern ourselves with graph creation. + However, before we do so we must introduce a utility function ``variable_on_cpu()`` + used to create a variable in CPU memory. + """ + # Use the /cpu:0 device for scoped operations + with tf.device(Config.cpu_device): + # Create or get apropos variable + var = tfv1.get_variable(name=name, shape=shape, initializer=initializer) + return var + + +def create_overlapping_windows(batch_x): + batch_size = tf.shape(input=batch_x)[0] + window_width = 2 * Config.n_context + 1 + num_channels = Config.n_input + + # Create a constant convolution filter using an identity matrix, so that the + # convolution returns patches of the input tensor as is, and we can create + # overlapping windows over the MFCCs. 
+ eye_filter = tf.constant( + np.eye(window_width * num_channels).reshape( + window_width, num_channels, window_width * num_channels + ), + tf.float32, + ) # pylint: disable=bad-continuation + + # Create overlapping windows + batch_x = tf.nn.conv1d(input=batch_x, filters=eye_filter, stride=1, padding="SAME") + + # Remove dummy depth dimension and reshape into [batch_size, n_windows, window_width, n_input] + batch_x = tf.reshape(batch_x, [batch_size, -1, window_width, num_channels]) + + return batch_x + + +def dense(name, x, units, dropout_rate=None, relu=True, layer_norm=False): + with tfv1.variable_scope(name): + bias = variable_on_cpu("bias", [units], tfv1.zeros_initializer()) + weights = variable_on_cpu( + "weights", + [x.shape[-1], units], + tfv1.keras.initializers.VarianceScaling( + scale=1.0, mode="fan_avg", distribution="uniform" + ), + ) + + output = tf.nn.bias_add(tf.matmul(x, weights), bias) + + if relu: + output = tf.minimum(tf.nn.relu(output), Config.relu_clip) + + if layer_norm: + with tfv1.variable_scope(name): + output = tf.contrib.layers.layer_norm(output) + + if dropout_rate is not None: + output = tf.nn.dropout(output, rate=dropout_rate) + + return output + + +def rnn_impl_lstmblockfusedcell(x, seq_length, previous_state, reuse): + with tfv1.variable_scope("cudnn_lstm/rnn/multi_rnn_cell/cell_0"): + fw_cell = tf.contrib.rnn.LSTMBlockFusedCell( + Config.n_cell_dim, + forget_bias=0, + reuse=reuse, + name="cudnn_compatible_lstm_cell", + ) + + output, output_state = fw_cell( + inputs=x, + dtype=tf.float32, + sequence_length=seq_length, + initial_state=previous_state, + ) + + return output, output_state + + +def rnn_impl_cudnn_rnn(x, seq_length, previous_state, _): + assert ( + previous_state is None + ) # 'Passing previous state not supported with CuDNN backend' + + # Hack: CudnnLSTM works similarly to Keras layers in that when you instantiate + # the object it creates the variables, and then you just call it several times + # to enable variable re-use. Because all of our code is structure in an old + # school TensorFlow structure where you can just call tf.get_variable again with + # reuse=True to reuse variables, we can't easily make use of the object oriented + # way CudnnLSTM is implemented, so we save a singleton instance in the function, + # emulating a static function variable. 
+ if not rnn_impl_cudnn_rnn.cell: + # Forward direction cell: + fw_cell = tf.contrib.cudnn_rnn.CudnnLSTM( + num_layers=1, + num_units=Config.n_cell_dim, + input_mode="linear_input", + direction="unidirectional", + dtype=tf.float32, + ) + rnn_impl_cudnn_rnn.cell = fw_cell + + output, output_state = rnn_impl_cudnn_rnn.cell( + inputs=x, sequence_lengths=seq_length + ) + + return output, output_state + + +rnn_impl_cudnn_rnn.cell = None + + +def rnn_impl_static_rnn(x, seq_length, previous_state, reuse): + with tfv1.variable_scope("cudnn_lstm/rnn/multi_rnn_cell"): + # Forward direction cell: + fw_cell = tfv1.nn.rnn_cell.LSTMCell( + Config.n_cell_dim, + forget_bias=0, + reuse=reuse, + name="cudnn_compatible_lstm_cell", + ) + + # Split rank N tensor into list of rank N-1 tensors + x = [x[l] for l in range(x.shape[0])] + + output, output_state = tfv1.nn.static_rnn( + cell=fw_cell, + inputs=x, + sequence_length=seq_length, + initial_state=previous_state, + dtype=tf.float32, + scope="cell_0", + ) + + output = tf.concat(output, 0) + + return output, output_state + + +def create_model( + batch_x, + seq_length, + dropout, + reuse=False, + batch_size=None, + previous_state=None, + overlap=True, + rnn_impl=rnn_impl_lstmblockfusedcell, +): + layers = {} + + # Input shape: [batch_size, n_steps, n_input + 2*n_input*n_context] + if not batch_size: + batch_size = tf.shape(input=batch_x)[0] + + # Create overlapping feature windows if needed + if overlap: + batch_x = create_overlapping_windows(batch_x) + + # Reshaping `batch_x` to a tensor with shape `[n_steps*batch_size, n_input + 2*n_input*n_context]`. + # This is done to prepare the batch for input into the first layer which expects a tensor of rank `2`. + + # Permute n_steps and batch_size + batch_x = tf.transpose(a=batch_x, perm=[1, 0, 2, 3]) + # Reshape to prepare input for first layer + batch_x = tf.reshape( + batch_x, [-1, Config.n_input + 2 * Config.n_input * Config.n_context] + ) # (n_steps*batch_size, n_input + 2*n_input*n_context) + layers["input_reshaped"] = batch_x + + # The next three blocks will pass `batch_x` through three hidden layers with + # clipped RELU activation and dropout. + layers["layer_1"] = layer_1 = dense( + "layer_1", + batch_x, + Config.n_hidden_1, + dropout_rate=dropout[0], + layer_norm=Config.layer_norm, + ) + layers["layer_2"] = layer_2 = dense( + "layer_2", + layer_1, + Config.n_hidden_2, + dropout_rate=dropout[1], + layer_norm=Config.layer_norm, + ) + layers["layer_3"] = layer_3 = dense( + "layer_3", + layer_2, + Config.n_hidden_3, + dropout_rate=dropout[2], + layer_norm=Config.layer_norm, + ) + + # `layer_3` is now reshaped into `[n_steps, batch_size, 2*n_cell_dim]`, + # as the LSTM RNN expects its input to be of shape `[max_time, batch_size, input_size]`. 
+ layer_3 = tf.reshape(layer_3, [-1, batch_size, Config.n_hidden_3]) + + # Run through parametrized RNN implementation, as we use different RNNs + # for training and inference + output, output_state = rnn_impl(layer_3, seq_length, previous_state, reuse) + + # Reshape output from a tensor of shape [n_steps, batch_size, n_cell_dim] + # to a tensor of shape [n_steps*batch_size, n_cell_dim] + output = tf.reshape(output, [-1, Config.n_cell_dim]) + layers["rnn_output"] = output + layers["rnn_output_state"] = output_state + + # Now we feed `output` to the fifth hidden layer with clipped RELU activation + layers["layer_5"] = layer_5 = dense( + "layer_5", + output, + Config.n_hidden_5, + dropout_rate=dropout[5], + layer_norm=Config.layer_norm, + ) + + # Now we apply a final linear layer creating `n_classes` dimensional vectors, the logits. + layers["layer_6"] = layer_6 = dense( + "layer_6", layer_5, Config.n_hidden_6, relu=False + ) + + # Finally we reshape layer_6 from a tensor of shape [n_steps*batch_size, n_hidden_6] + # to the slightly more useful shape [n_steps, batch_size, n_hidden_6]. + # Note, that this differs from the input in that it is time-major. + layer_6 = tf.reshape( + layer_6, [-1, batch_size, Config.n_hidden_6], name="raw_logits" + ) + layers["raw_logits"] = layer_6 + + # Output shape: [n_steps, batch_size, n_hidden_6] + return layer_6, layers + + +def create_inference_graph(batch_size=1, n_steps=16, tflite=False): + batch_size = batch_size if batch_size > 0 else None + + # Create feature computation graph + + # native_client: this node's name and shape are part of the API boundary + # with the native client, if you change them you should sync changes with + # the C++ code. + input_samples = tfv1.placeholder( + tf.float32, [Config.audio_window_samples], "input_samples" + ) + samples = tf.expand_dims(input_samples, -1) + mfccs, _ = audio_to_features(samples, Config.audio_sample_rate) + # native_client: this node's name and shape are part of the API boundary + # with the native client, if you change them you should sync changes with + # the C++ code. + mfccs = tf.identity(mfccs, name="mfccs") + + # Input tensor will be of shape [batch_size, n_steps, 2*n_context+1, n_input] + # This shape is read by the native_client in STT_CreateModel to know the + # value of n_steps, n_context and n_input. Make sure you update the code + # there if this shape is changed. + # + # native_client: this node's name and shape are part of the API boundary + # with the native client, if you change them you should sync changes with + # the C++ code. + input_tensor = tfv1.placeholder( + tf.float32, + [ + batch_size, + n_steps if n_steps > 0 else None, + 2 * Config.n_context + 1, + Config.n_input, + ], + name="input_node", + ) + # native_client: this node's name and shape are part of the API boundary + # with the native client, if you change them you should sync changes with + # the C++ code. + seq_length = tfv1.placeholder(tf.int32, [batch_size], name="input_lengths") + + if batch_size <= 0: + # no state management since n_step is expected to be dynamic too (see below) + previous_state = None + else: + # native_client: this node's name and shape are part of the API boundary + # with the native client, if you change them you should sync changes with + # the C++ code. 
+ previous_state_c = tfv1.placeholder( + tf.float32, [batch_size, Config.n_cell_dim], name="previous_state_c" + ) + # native_client: this node's name and shape are part of the API boundary + # with the native client, if you change them you should sync changes with + # the C++ code. + previous_state_h = tfv1.placeholder( + tf.float32, [batch_size, Config.n_cell_dim], name="previous_state_h" + ) + + previous_state = tf.nn.rnn_cell.LSTMStateTuple( + previous_state_c, previous_state_h + ) + + # One rate per layer + no_dropout = [None] * 6 + + if tflite: + rnn_impl = rnn_impl_static_rnn + else: + rnn_impl = rnn_impl_lstmblockfusedcell + + logits, layers = create_model( + batch_x=input_tensor, + batch_size=batch_size, + seq_length=seq_length if not Config.export_tflite else None, + dropout=no_dropout, + previous_state=previous_state, + overlap=False, + rnn_impl=rnn_impl, + ) + + # TF Lite runtime will check that input dimensions are 1, 2 or 4 + # by default we get 3, the middle one being batch_size which is forced to + # one on inference graph, so remove that dimension + # + # native_client: this node's name and shape are part of the API boundary + # with the native client, if you change them you should sync changes with + # the C++ code. + if tflite: + logits = tf.squeeze(logits, [1]) + + # Apply softmax for CTC decoder + probs = tf.nn.softmax(logits, name="logits") + + if batch_size <= 0: + if tflite: + raise NotImplementedError( + "dynamic batch_size does not support tflite nor streaming" + ) + if n_steps > 0: + raise NotImplementedError( + "dynamic batch_size expect n_steps to be dynamic too" + ) + return ( + { + "input": input_tensor, + "input_lengths": seq_length, + }, + { + "outputs": probs, + }, + layers, + ) + + new_state_c, new_state_h = layers["rnn_output_state"] + new_state_c = tf.identity(new_state_c, name="new_state_c") + new_state_h = tf.identity(new_state_h, name="new_state_h") + + inputs = { + "input": input_tensor, + "previous_state_c": previous_state_c, + "previous_state_h": previous_state_h, + "input_samples": input_samples, + } + + if not Config.export_tflite: + inputs["input_lengths"] = seq_length + + outputs = { + "outputs": probs, + "new_state_c": new_state_c, + "new_state_h": new_state_h, + "mfccs": mfccs, + # Expose internal layers for downstream applications + "layer_3": layers["layer_3"], + "layer_5": layers["layer_5"], + } + + return inputs, outputs, layers diff --git a/training/coqui_stt_training/evaluate.py b/training/coqui_stt_training/evaluate.py old mode 100755 new mode 100644 index 4425582b..6c4f6dca --- a/training/coqui_stt_training/evaluate.py +++ b/training/coqui_stt_training/evaluate.py @@ -13,6 +13,7 @@ from six.moves import zip import tensorflow as tf +from .deepspeech_model import create_model from .util.augmentations import NormalizeSampleRate from .util.checkpoints import load_graph_for_evaluation from .util.config import ( @@ -168,25 +169,25 @@ def evaluate(test_csvs, create_model): return samples -def main(): - initialize_globals_from_cli() - - if not Config.test_files: - log_error( - "You need to specify what files to use for evaluation via " - "the --test_files flag." 
- ) - sys.exit(1) - - from .train import ( # pylint: disable=cyclic-import,import-outside-toplevel - create_model, - ) +def test(): + tfv1.reset_default_graph() samples = evaluate(Config.test_files, create_model) - if Config.test_output_file: save_samples_json(samples, Config.test_output_file) +def main(): + initialize_globals_from_cli() + + if not Config.test_files: + raise RuntimeError( + "You need to specify what files to use for evaluation via " + "the --test_files flag." + ) + + test() + + if __name__ == "__main__": main() diff --git a/training/coqui_stt_training/export.py b/training/coqui_stt_training/export.py new file mode 100644 index 00000000..22c31ad6 --- /dev/null +++ b/training/coqui_stt_training/export.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import os +import sys + +LOG_LEVEL_INDEX = sys.argv.index("--log_level") + 1 if "--log_level" in sys.argv else 0 +DESIRED_LOG_LEVEL = ( + sys.argv[LOG_LEVEL_INDEX] if 0 < LOG_LEVEL_INDEX < len(sys.argv) else "3" +) +os.environ["TF_CPP_MIN_LOG_LEVEL"] = DESIRED_LOG_LEVEL + +import tensorflow as tf +import tensorflow.compat.v1 as tfv1 +import shutil + +from .deepspeech_model import create_inference_graph +from .util.checkpoints import load_graph_for_evaluation +from .util.config import Config, initialize_globals_from_cli, log_error, log_info +from .util.io import ( + open_remote, + rmtree_remote, + listdir_remote, + is_remote_path, + isdir_remote, +) + + +def file_relative_read(fname): + return open(os.path.join(os.path.dirname(__file__), fname)).read() + + +def export(): + r""" + Restores the trained variables into a simpler graph that will be exported for serving. + """ + log_info("Exporting the model...") + + tfv1.reset_default_graph() + + inputs, outputs, _ = create_inference_graph( + batch_size=Config.export_batch_size, + n_steps=Config.n_steps, + tflite=Config.export_tflite, + ) + + graph_version = int(file_relative_read("GRAPH_VERSION").strip()) + assert graph_version > 0 + + # native_client: these nodes's names and shapes are part of the API boundary + # with the native client, if you change them you should sync changes with + # the C++ code. 
+ outputs["metadata_version"] = tf.constant([graph_version], name="metadata_version") + outputs["metadata_sample_rate"] = tf.constant( + [Config.audio_sample_rate], name="metadata_sample_rate" + ) + outputs["metadata_feature_win_len"] = tf.constant( + [Config.feature_win_len], name="metadata_feature_win_len" + ) + outputs["metadata_feature_win_step"] = tf.constant( + [Config.feature_win_step], name="metadata_feature_win_step" + ) + outputs["metadata_beam_width"] = tf.constant( + [Config.export_beam_width], name="metadata_beam_width" + ) + outputs["metadata_alphabet"] = tf.constant( + [Config.alphabet.Serialize()], name="metadata_alphabet" + ) + + if Config.export_language: + outputs["metadata_language"] = tf.constant( + [Config.export_language.encode("utf-8")], name="metadata_language" + ) + + # Prevent further graph changes + tfv1.get_default_graph().finalize() + + output_names_tensors = [ + tensor.op.name for tensor in outputs.values() if isinstance(tensor, tf.Tensor) + ] + output_names_ops = [ + op.name for op in outputs.values() if isinstance(op, tf.Operation) + ] + output_names = output_names_tensors + output_names_ops + + with tf.Session() as session: + # Restore variables from checkpoint + load_graph_for_evaluation(session) + + output_filename = Config.export_file_name + ".pb" + if Config.remove_export: + if isdir_remote(Config.export_dir): + log_info("Removing old export") + rmtree_remote(Config.export_dir) + + output_graph_path = os.path.join(Config.export_dir, output_filename) + + if not is_remote_path(Config.export_dir) and not os.path.isdir( + Config.export_dir + ): + os.makedirs(Config.export_dir) + + frozen_graph = tfv1.graph_util.convert_variables_to_constants( + sess=session, + input_graph_def=tfv1.get_default_graph().as_graph_def(), + output_node_names=output_names, + ) + + frozen_graph = tfv1.graph_util.extract_sub_graph( + graph_def=frozen_graph, dest_nodes=output_names + ) + + if not Config.export_tflite: + with open_remote(output_graph_path, "wb") as fout: + fout.write(frozen_graph.SerializeToString()) + else: + output_tflite_path = os.path.join( + Config.export_dir, output_filename.replace(".pb", ".tflite") + ) + + converter = tf.lite.TFLiteConverter( + frozen_graph, + input_tensors=inputs.values(), + output_tensors=outputs.values(), + ) + + if Config.export_quantize: + converter.optimizations = [tf.lite.Optimize.DEFAULT] + + # AudioSpectrogram and Mfcc ops are custom but have built-in kernels in TFLite + converter.allow_custom_ops = True + tflite_model = converter.convert() + + with open_remote(output_tflite_path, "wb") as fout: + fout.write(tflite_model) + + log_info("Models exported at %s" % (Config.export_dir)) + + metadata_fname = os.path.join( + Config.export_dir, + "{}_{}_{}.md".format( + Config.export_author_id, + Config.export_model_name, + Config.export_model_version, + ), + ) + + model_runtime = "tflite" if Config.export_tflite else "tensorflow" + with open_remote(metadata_fname, "w") as f: + f.write("---\n") + f.write("author: {}\n".format(Config.export_author_id)) + f.write("model_name: {}\n".format(Config.export_model_name)) + f.write("model_version: {}\n".format(Config.export_model_version)) + f.write("contact_info: {}\n".format(Config.export_contact_info)) + f.write("license: {}\n".format(Config.export_license)) + f.write("language: {}\n".format(Config.export_language)) + f.write("runtime: {}\n".format(model_runtime)) + f.write("min_stt_version: {}\n".format(Config.export_min_stt_version)) + f.write("max_stt_version: 
{}\n".format(Config.export_max_stt_version)) + f.write( + "acoustic_model_url: \n" + ) + f.write( + "scorer_url: \n" + ) + f.write("---\n") + f.write("{}\n".format(Config.export_description)) + + log_info( + "Model metadata file saved to {}. Before submitting the exported model for publishing make sure all information in the metadata file is correct, and complete the URL fields.".format( + metadata_fname + ) + ) + + +def package_zip(): + # --export_dir path/to/export/LANG_CODE/ => path/to/export/LANG_CODE.zip + export_dir = os.path.join( + os.path.abspath(Config.export_dir), "" + ) # Force ending '/' + if is_remote_path(export_dir): + log_error( + "Cannot package remote path zip %s. Please do this manually." % export_dir + ) + return + + zip_filename = os.path.dirname(export_dir) + + shutil.copy(Config.scorer_path, export_dir) + + archive = shutil.make_archive(zip_filename, "zip", export_dir) + log_info("Exported packaged model {}".format(archive)) + + +def main(_): + initialize_globals_from_cli() + + if not Config.export_dir: + raise RuntimeError( + "Calling export script directly but no --export_dir specified" + ) + + if not Config.export_zip: + # Export to folder + export() + else: + if listdir_remote(Config.export_dir): + raise RuntimeError( + "Directory {} is not empty, please fix this.".format(Config.export_dir) + ) + + export() + package_zip() + + +if __name__ == "__main__": + main() diff --git a/training/coqui_stt_training/train.py b/training/coqui_stt_training/train.py index acfe15e2..6032184c 100644 --- a/training/coqui_stt_training/train.py +++ b/training/coqui_stt_training/train.py @@ -14,12 +14,13 @@ os.environ["TF_CPP_MIN_LOG_LEVEL"] = DESIRED_LOG_LEVEL import json import shutil import time +from datetime import datetime import numpy as np import progressbar import tensorflow.compat.v1 as tfv1 - import tensorflow as tf +from coqui_stt_ctcdecoder import Scorer tfv1.logging.set_verbosity( { @@ -30,12 +31,15 @@ tfv1.logging.set_verbosity( }.get(DESIRED_LOG_LEVEL) ) -from datetime import datetime -from coqui_stt_ctcdecoder import Scorer, ctc_beam_search_decoder -from six.moves import range, zip - -from .evaluate import evaluate +from . import evaluate +from . import export +from . import training_graph_inference +from .deepspeech_model import ( + create_model, + rnn_impl_lstmblockfusedcell, + rnn_impl_cudnn_rnn, +) from .util.augmentations import NormalizeSampleRate from .util.checkpoints import ( load_graph_for_evaluation, @@ -52,260 +56,16 @@ from .util.config import ( log_progress, log_warn, ) -from .util.evaluate_tools import save_samples_json -from .util.feeding import audio_to_features, audiofile_to_features, create_dataset +from .util.feeding import create_dataset from .util.helpers import ExceptionBox, check_ctcdecoder_version from .util.io import ( is_remote_path, - isdir_remote, - listdir_remote, open_remote, remove_remote, ) check_ctcdecoder_version() -# Graph Creation -# ============== - - -def variable_on_cpu(name, shape, initializer): - r""" - Next we concern ourselves with graph creation. - However, before we do so we must introduce a utility function ``variable_on_cpu()`` - used to create a variable in CPU memory. 
- """ - # Use the /cpu:0 device for scoped operations - with tf.device(Config.cpu_device): - # Create or get apropos variable - var = tfv1.get_variable(name=name, shape=shape, initializer=initializer) - return var - - -def create_overlapping_windows(batch_x): - batch_size = tf.shape(input=batch_x)[0] - window_width = 2 * Config.n_context + 1 - num_channels = Config.n_input - - # Create a constant convolution filter using an identity matrix, so that the - # convolution returns patches of the input tensor as is, and we can create - # overlapping windows over the MFCCs. - eye_filter = tf.constant( - np.eye(window_width * num_channels).reshape( - window_width, num_channels, window_width * num_channels - ), - tf.float32, - ) # pylint: disable=bad-continuation - - # Create overlapping windows - batch_x = tf.nn.conv1d(input=batch_x, filters=eye_filter, stride=1, padding="SAME") - - # Remove dummy depth dimension and reshape into [batch_size, n_windows, window_width, n_input] - batch_x = tf.reshape(batch_x, [batch_size, -1, window_width, num_channels]) - - return batch_x - - -def dense(name, x, units, dropout_rate=None, relu=True, layer_norm=False): - with tfv1.variable_scope(name): - bias = variable_on_cpu("bias", [units], tfv1.zeros_initializer()) - weights = variable_on_cpu( - "weights", - [x.shape[-1], units], - tfv1.keras.initializers.VarianceScaling( - scale=1.0, mode="fan_avg", distribution="uniform" - ), - ) - - output = tf.nn.bias_add(tf.matmul(x, weights), bias) - - if relu: - output = tf.minimum(tf.nn.relu(output), Config.relu_clip) - - if layer_norm: - with tfv1.variable_scope(name): - output = tf.contrib.layers.layer_norm(output) - - if dropout_rate is not None: - output = tf.nn.dropout(output, rate=dropout_rate) - - return output - - -def rnn_impl_lstmblockfusedcell(x, seq_length, previous_state, reuse): - with tfv1.variable_scope("cudnn_lstm/rnn/multi_rnn_cell/cell_0"): - fw_cell = tf.contrib.rnn.LSTMBlockFusedCell( - Config.n_cell_dim, - forget_bias=0, - reuse=reuse, - name="cudnn_compatible_lstm_cell", - ) - - output, output_state = fw_cell( - inputs=x, - dtype=tf.float32, - sequence_length=seq_length, - initial_state=previous_state, - ) - - return output, output_state - - -def rnn_impl_cudnn_rnn(x, seq_length, previous_state, _): - assert ( - previous_state is None - ) # 'Passing previous state not supported with CuDNN backend' - - # Hack: CudnnLSTM works similarly to Keras layers in that when you instantiate - # the object it creates the variables, and then you just call it several times - # to enable variable re-use. Because all of our code is structure in an old - # school TensorFlow structure where you can just call tf.get_variable again with - # reuse=True to reuse variables, we can't easily make use of the object oriented - # way CudnnLSTM is implemented, so we save a singleton instance in the function, - # emulating a static function variable. 
- if not rnn_impl_cudnn_rnn.cell: - # Forward direction cell: - fw_cell = tf.contrib.cudnn_rnn.CudnnLSTM( - num_layers=1, - num_units=Config.n_cell_dim, - input_mode="linear_input", - direction="unidirectional", - dtype=tf.float32, - ) - rnn_impl_cudnn_rnn.cell = fw_cell - - output, output_state = rnn_impl_cudnn_rnn.cell( - inputs=x, sequence_lengths=seq_length - ) - - return output, output_state - - -rnn_impl_cudnn_rnn.cell = None - - -def rnn_impl_static_rnn(x, seq_length, previous_state, reuse): - with tfv1.variable_scope("cudnn_lstm/rnn/multi_rnn_cell"): - # Forward direction cell: - fw_cell = tfv1.nn.rnn_cell.LSTMCell( - Config.n_cell_dim, - forget_bias=0, - reuse=reuse, - name="cudnn_compatible_lstm_cell", - ) - - # Split rank N tensor into list of rank N-1 tensors - x = [x[l] for l in range(x.shape[0])] - - output, output_state = tfv1.nn.static_rnn( - cell=fw_cell, - inputs=x, - sequence_length=seq_length, - initial_state=previous_state, - dtype=tf.float32, - scope="cell_0", - ) - - output = tf.concat(output, 0) - - return output, output_state - - -def create_model( - batch_x, - seq_length, - dropout, - reuse=False, - batch_size=None, - previous_state=None, - overlap=True, - rnn_impl=rnn_impl_lstmblockfusedcell, -): - layers = {} - - # Input shape: [batch_size, n_steps, n_input + 2*n_input*n_context] - if not batch_size: - batch_size = tf.shape(input=batch_x)[0] - - # Create overlapping feature windows if needed - if overlap: - batch_x = create_overlapping_windows(batch_x) - - # Reshaping `batch_x` to a tensor with shape `[n_steps*batch_size, n_input + 2*n_input*n_context]`. - # This is done to prepare the batch for input into the first layer which expects a tensor of rank `2`. - - # Permute n_steps and batch_size - batch_x = tf.transpose(a=batch_x, perm=[1, 0, 2, 3]) - # Reshape to prepare input for first layer - batch_x = tf.reshape( - batch_x, [-1, Config.n_input + 2 * Config.n_input * Config.n_context] - ) # (n_steps*batch_size, n_input + 2*n_input*n_context) - layers["input_reshaped"] = batch_x - - # The next three blocks will pass `batch_x` through three hidden layers with - # clipped RELU activation and dropout. - layers["layer_1"] = layer_1 = dense( - "layer_1", - batch_x, - Config.n_hidden_1, - dropout_rate=dropout[0], - layer_norm=Config.layer_norm, - ) - layers["layer_2"] = layer_2 = dense( - "layer_2", - layer_1, - Config.n_hidden_2, - dropout_rate=dropout[1], - layer_norm=Config.layer_norm, - ) - layers["layer_3"] = layer_3 = dense( - "layer_3", - layer_2, - Config.n_hidden_3, - dropout_rate=dropout[2], - layer_norm=Config.layer_norm, - ) - - # `layer_3` is now reshaped into `[n_steps, batch_size, 2*n_cell_dim]`, - # as the LSTM RNN expects its input to be of shape `[max_time, batch_size, input_size]`. 
- layer_3 = tf.reshape(layer_3, [-1, batch_size, Config.n_hidden_3]) - - # Run through parametrized RNN implementation, as we use different RNNs - # for training and inference - output, output_state = rnn_impl(layer_3, seq_length, previous_state, reuse) - - # Reshape output from a tensor of shape [n_steps, batch_size, n_cell_dim] - # to a tensor of shape [n_steps*batch_size, n_cell_dim] - output = tf.reshape(output, [-1, Config.n_cell_dim]) - layers["rnn_output"] = output - layers["rnn_output_state"] = output_state - - # Now we feed `output` to the fifth hidden layer with clipped RELU activation - layers["layer_5"] = layer_5 = dense( - "layer_5", - output, - Config.n_hidden_5, - dropout_rate=dropout[5], - layer_norm=Config.layer_norm, - ) - - # Now we apply a final linear layer creating `n_classes` dimensional vectors, the logits. - layers["layer_6"] = layer_6 = dense( - "layer_6", layer_5, Config.n_hidden_6, relu=False - ) - - # Finally we reshape layer_6 from a tensor of shape [n_steps*batch_size, n_hidden_6] - # to the slightly more useful shape [n_steps, batch_size, n_hidden_6]. - # Note, that this differs from the input in that it is time-major. - layer_6 = tf.reshape( - layer_6, [-1, batch_size, Config.n_hidden_6], name="raw_logits" - ) - layers["raw_logits"] = layer_6 - - # Output shape: [n_steps, batch_size, n_hidden_6] - return layer_6, layers - - # Accuracy and Loss # ================= @@ -900,371 +660,6 @@ def train(): log_debug("Session closed.") -def test(): - tfv1.reset_default_graph() - - samples = evaluate(Config.test_files, create_model) - if Config.test_output_file: - save_samples_json(samples, Config.test_output_file) - - -def create_inference_graph(batch_size=1, n_steps=16, tflite=False): - batch_size = batch_size if batch_size > 0 else None - - # Create feature computation graph - - # native_client: this node's name and shape are part of the API boundary - # with the native client, if you change them you should sync changes with - # the C++ code. - input_samples = tfv1.placeholder( - tf.float32, [Config.audio_window_samples], "input_samples" - ) - samples = tf.expand_dims(input_samples, -1) - mfccs, _ = audio_to_features(samples, Config.audio_sample_rate) - # native_client: this node's name and shape are part of the API boundary - # with the native client, if you change them you should sync changes with - # the C++ code. - mfccs = tf.identity(mfccs, name="mfccs") - - # Input tensor will be of shape [batch_size, n_steps, 2*n_context+1, n_input] - # This shape is read by the native_client in STT_CreateModel to know the - # value of n_steps, n_context and n_input. Make sure you update the code - # there if this shape is changed. - # - # native_client: this node's name and shape are part of the API boundary - # with the native client, if you change them you should sync changes with - # the C++ code. - input_tensor = tfv1.placeholder( - tf.float32, - [ - batch_size, - n_steps if n_steps > 0 else None, - 2 * Config.n_context + 1, - Config.n_input, - ], - name="input_node", - ) - # native_client: this node's name and shape are part of the API boundary - # with the native client, if you change them you should sync changes with - # the C++ code. 
- seq_length = tfv1.placeholder(tf.int32, [batch_size], name="input_lengths") - - if batch_size <= 0: - # no state management since n_step is expected to be dynamic too (see below) - previous_state = None - else: - # native_client: this node's name and shape are part of the API boundary - # with the native client, if you change them you should sync changes with - # the C++ code. - previous_state_c = tfv1.placeholder( - tf.float32, [batch_size, Config.n_cell_dim], name="previous_state_c" - ) - # native_client: this node's name and shape are part of the API boundary - # with the native client, if you change them you should sync changes with - # the C++ code. - previous_state_h = tfv1.placeholder( - tf.float32, [batch_size, Config.n_cell_dim], name="previous_state_h" - ) - - previous_state = tf.nn.rnn_cell.LSTMStateTuple( - previous_state_c, previous_state_h - ) - - # One rate per layer - no_dropout = [None] * 6 - - if tflite: - rnn_impl = rnn_impl_static_rnn - else: - rnn_impl = rnn_impl_lstmblockfusedcell - - logits, layers = create_model( - batch_x=input_tensor, - batch_size=batch_size, - seq_length=seq_length if not Config.export_tflite else None, - dropout=no_dropout, - previous_state=previous_state, - overlap=False, - rnn_impl=rnn_impl, - ) - - # TF Lite runtime will check that input dimensions are 1, 2 or 4 - # by default we get 3, the middle one being batch_size which is forced to - # one on inference graph, so remove that dimension - # - # native_client: this node's name and shape are part of the API boundary - # with the native client, if you change them you should sync changes with - # the C++ code. - if tflite: - logits = tf.squeeze(logits, [1]) - - # Apply softmax for CTC decoder - probs = tf.nn.softmax(logits, name="logits") - - if batch_size <= 0: - if tflite: - raise NotImplementedError( - "dynamic batch_size does not support tflite nor streaming" - ) - if n_steps > 0: - raise NotImplementedError( - "dynamic batch_size expect n_steps to be dynamic too" - ) - return ( - { - "input": input_tensor, - "input_lengths": seq_length, - }, - { - "outputs": probs, - }, - layers, - ) - - new_state_c, new_state_h = layers["rnn_output_state"] - new_state_c = tf.identity(new_state_c, name="new_state_c") - new_state_h = tf.identity(new_state_h, name="new_state_h") - - inputs = { - "input": input_tensor, - "previous_state_c": previous_state_c, - "previous_state_h": previous_state_h, - "input_samples": input_samples, - } - - if not Config.export_tflite: - inputs["input_lengths"] = seq_length - - outputs = { - "outputs": probs, - "new_state_c": new_state_c, - "new_state_h": new_state_h, - "mfccs": mfccs, - # Expose internal layers for downstream applications - "layer_3": layers["layer_3"], - "layer_5": layers["layer_5"], - } - - return inputs, outputs, layers - - -def file_relative_read(fname): - return open(os.path.join(os.path.dirname(__file__), fname)).read() - - -def export(): - r""" - Restores the trained variables into a simpler graph that will be exported for serving. - """ - log_info("Exporting the model...") - - tfv1.reset_default_graph() - - inputs, outputs, _ = create_inference_graph( - batch_size=Config.export_batch_size, - n_steps=Config.n_steps, - tflite=Config.export_tflite, - ) - - graph_version = int(file_relative_read("GRAPH_VERSION").strip()) - assert graph_version > 0 - - # native_client: these nodes's names and shapes are part of the API boundary - # with the native client, if you change them you should sync changes with - # the C++ code. 
- outputs["metadata_version"] = tf.constant([graph_version], name="metadata_version") - outputs["metadata_sample_rate"] = tf.constant( - [Config.audio_sample_rate], name="metadata_sample_rate" - ) - outputs["metadata_feature_win_len"] = tf.constant( - [Config.feature_win_len], name="metadata_feature_win_len" - ) - outputs["metadata_feature_win_step"] = tf.constant( - [Config.feature_win_step], name="metadata_feature_win_step" - ) - outputs["metadata_beam_width"] = tf.constant( - [Config.export_beam_width], name="metadata_beam_width" - ) - outputs["metadata_alphabet"] = tf.constant( - [Config.alphabet.Serialize()], name="metadata_alphabet" - ) - - if Config.export_language: - outputs["metadata_language"] = tf.constant( - [Config.export_language.encode("utf-8")], name="metadata_language" - ) - - # Prevent further graph changes - tfv1.get_default_graph().finalize() - - output_names_tensors = [ - tensor.op.name for tensor in outputs.values() if isinstance(tensor, tf.Tensor) - ] - output_names_ops = [ - op.name for op in outputs.values() if isinstance(op, tf.Operation) - ] - output_names = output_names_tensors + output_names_ops - - with tf.Session() as session: - # Restore variables from checkpoint - load_graph_for_evaluation(session) - - output_filename = Config.export_file_name + ".pb" - if Config.remove_export: - if isdir_remote(Config.export_dir): - log_info("Removing old export") - remove_remote(Config.export_dir) - - output_graph_path = os.path.join(Config.export_dir, output_filename) - - if not is_remote_path(Config.export_dir) and not os.path.isdir( - Config.export_dir - ): - os.makedirs(Config.export_dir) - - frozen_graph = tfv1.graph_util.convert_variables_to_constants( - sess=session, - input_graph_def=tfv1.get_default_graph().as_graph_def(), - output_node_names=output_names, - ) - - frozen_graph = tfv1.graph_util.extract_sub_graph( - graph_def=frozen_graph, dest_nodes=output_names - ) - - if not Config.export_tflite: - with open_remote(output_graph_path, "wb") as fout: - fout.write(frozen_graph.SerializeToString()) - else: - output_tflite_path = os.path.join( - Config.export_dir, output_filename.replace(".pb", ".tflite") - ) - - converter = tf.lite.TFLiteConverter( - frozen_graph, - input_tensors=inputs.values(), - output_tensors=outputs.values(), - ) - - if Config.export_quantize: - converter.optimizations = [tf.lite.Optimize.DEFAULT] - - # AudioSpectrogram and Mfcc ops are custom but have built-in kernels in TFLite - converter.allow_custom_ops = True - tflite_model = converter.convert() - - with open_remote(output_tflite_path, "wb") as fout: - fout.write(tflite_model) - - log_info("Models exported at %s" % (Config.export_dir)) - - metadata_fname = os.path.join( - Config.export_dir, - "{}_{}_{}.md".format( - Config.export_author_id, - Config.export_model_name, - Config.export_model_version, - ), - ) - - model_runtime = "tflite" if Config.export_tflite else "tensorflow" - with open_remote(metadata_fname, "w") as f: - f.write("---\n") - f.write("author: {}\n".format(Config.export_author_id)) - f.write("model_name: {}\n".format(Config.export_model_name)) - f.write("model_version: {}\n".format(Config.export_model_version)) - f.write("contact_info: {}\n".format(Config.export_contact_info)) - f.write("license: {}\n".format(Config.export_license)) - f.write("language: {}\n".format(Config.export_language)) - f.write("runtime: {}\n".format(model_runtime)) - f.write("min_stt_version: {}\n".format(Config.export_min_stt_version)) - f.write("max_stt_version: 
{}\n".format(Config.export_max_stt_version)) - f.write( - "acoustic_model_url: \n" - ) - f.write( - "scorer_url: \n" - ) - f.write("---\n") - f.write("{}\n".format(Config.export_description)) - - log_info( - "Model metadata file saved to {}. Before submitting the exported model for publishing make sure all information in the metadata file is correct, and complete the URL fields.".format( - metadata_fname - ) - ) - - -def package_zip(): - # --export_dir path/to/export/LANG_CODE/ => path/to/export/LANG_CODE.zip - export_dir = os.path.join( - os.path.abspath(Config.export_dir), "" - ) # Force ending '/' - if is_remote_path(export_dir): - log_error( - "Cannot package remote path zip %s. Please do this manually." % export_dir - ) - return - - zip_filename = os.path.dirname(export_dir) - - shutil.copy(Config.scorer_path, export_dir) - - archive = shutil.make_archive(zip_filename, "zip", export_dir) - log_info("Exported packaged model {}".format(archive)) - - -def do_single_file_inference(input_file_path): - tfv1.reset_default_graph() - - with tfv1.Session(config=Config.session_config) as session: - inputs, outputs, _ = create_inference_graph(batch_size=1, n_steps=-1) - - # Restore variables from training checkpoint - load_graph_for_evaluation(session) - - features, features_len = audiofile_to_features(input_file_path) - previous_state_c = np.zeros([1, Config.n_cell_dim]) - previous_state_h = np.zeros([1, Config.n_cell_dim]) - - # Add batch dimension - features = tf.expand_dims(features, 0) - features_len = tf.expand_dims(features_len, 0) - - # Evaluate - features = create_overlapping_windows(features).eval(session=session) - features_len = features_len.eval(session=session) - - probs = outputs["outputs"].eval( - feed_dict={ - inputs["input"]: features, - inputs["input_lengths"]: features_len, - inputs["previous_state_c"]: previous_state_c, - inputs["previous_state_h"]: previous_state_h, - }, - session=session, - ) - - probs = np.squeeze(probs) - - if Config.scorer_path: - scorer = Scorer( - Config.lm_alpha, Config.lm_beta, Config.scorer_path, Config.alphabet - ) - else: - scorer = None - decoded = ctc_beam_search_decoder( - probs, - Config.alphabet, - Config.beam_width, - scorer=scorer, - cutoff_prob=Config.cutoff_prob, - cutoff_top_n=Config.cutoff_top_n, - ) - # Print highest probability result - print(decoded[0][1]) - - def early_training_checks(): # Check for proper scorer early if Config.scorer_path: @@ -1289,36 +684,47 @@ def early_training_checks(): ) if not Config.alphabet_config_path and not Config.bytes_output_mode: - log_error("Missing --alphabet_config_path flag, can't continue") - sys.exit(1) + raise RuntimeError("Missing --alphabet_config_path flag, can't continue") def main(): initialize_globals_from_cli() + def deprecated_msg(prefix): + return ( + f"{prefix} Using the training script as a generic driver for all training " + "related functionality is deprecated and will be removed soon. Use " + "the specific scripts: train.py/evaluate.py/export.py/training_graph_inference.py." + ) + if Config.train_files: train() + else: + log_warn(deprecated_msg("Calling training script without --train_files.")) if Config.test_files: - test() - - if Config.export_dir and not Config.export_zip: - export() - - if Config.export_zip: - Config.export_tflite = True - - if listdir_remote(Config.export_dir): - log_error( - "Directory {} is not empty, please fix this.".format(Config.export_dir) + log_warn( + deprecated_msg( + "Specifying --test_files when calling train.py script. Use evaluate.py." 
)
- sys.exit(1)
+ )
+ evaluate.test()

- export()
- package_zip()
+ if Config.export_dir:
+ log_warn(
+ deprecated_msg(
+ "Specifying --export_dir when calling train.py script. Use export.py."
+ )
+ )
+ export.export()

 if Config.one_shot_infer:
- do_single_file_inference(Config.one_shot_infer)
+ log_warn(
+ deprecated_msg(
+ "Specifying --one_shot_infer when calling train.py script. Use training_graph_inference.py."
+ )
+ )
+ training_graph_inference.do_single_file_inference(Config.one_shot_infer)

 if __name__ == "__main__":
diff --git a/training/coqui_stt_training/training_graph_inference.py b/training/coqui_stt_training/training_graph_inference.py
new file mode 100644
index 00000000..b5399a91
--- /dev/null
+++ b/training/coqui_stt_training/training_graph_inference.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import os
+import sys
+
+LOG_LEVEL_INDEX = sys.argv.index("--log_level") + 1 if "--log_level" in sys.argv else 0
+DESIRED_LOG_LEVEL = (
+ sys.argv[LOG_LEVEL_INDEX] if 0 < LOG_LEVEL_INDEX < len(sys.argv) else "3"
+)
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = DESIRED_LOG_LEVEL
+
+import numpy as np
+import tensorflow as tf
+import tensorflow.compat.v1 as tfv1
+
+from coqui_stt_ctcdecoder import ctc_beam_search_decoder, Scorer
+from .deepspeech_model import create_inference_graph, create_overlapping_windows
+from .util.checkpoints import load_graph_for_evaluation
+from .util.config import Config, initialize_globals_from_cli, log_error
+from .util.feeding import audiofile_to_features
+
+
+def do_single_file_inference(input_file_path):
+ tfv1.reset_default_graph()
+
+ with tfv1.Session(config=Config.session_config) as session:
+ inputs, outputs, _ = create_inference_graph(batch_size=1, n_steps=-1)
+
+ # Restore variables from training checkpoint
+ load_graph_for_evaluation(session)
+
+ features, features_len = audiofile_to_features(input_file_path)
+ previous_state_c = np.zeros([1, Config.n_cell_dim])
+ previous_state_h = np.zeros([1, Config.n_cell_dim])
+
+ # Add batch dimension
+ features = tf.expand_dims(features, 0)
+ features_len = tf.expand_dims(features_len, 0)
+
+ # Evaluate
+ features = create_overlapping_windows(features).eval(session=session)
+ features_len = features_len.eval(session=session)
+
+ probs = outputs["outputs"].eval(
+ feed_dict={
+ inputs["input"]: features,
+ inputs["input_lengths"]: features_len,
+ inputs["previous_state_c"]: previous_state_c,
+ inputs["previous_state_h"]: previous_state_h,
+ },
+ session=session,
+ )
+
+ probs = np.squeeze(probs)
+
+ if Config.scorer_path:
+ scorer = Scorer(
+ Config.lm_alpha, Config.lm_beta, Config.scorer_path, Config.alphabet
+ )
+ else:
+ scorer = None
+ decoded = ctc_beam_search_decoder(
+ probs,
+ Config.alphabet,
+ Config.beam_width,
+ scorer=scorer,
+ cutoff_prob=Config.cutoff_prob,
+ cutoff_top_n=Config.cutoff_top_n,
+ )
+ # Print highest probability result
+ print(decoded[0][1])
+
+
+def main():
+ initialize_globals_from_cli()
+
+ if Config.one_shot_infer:
+ tfv1.reset_default_graph()
+ do_single_file_inference(Config.one_shot_infer)
+ else:
+ raise RuntimeError(
+ "Calling training_graph_inference script directly but no --one_shot_infer input audio file specified"
+ )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/training/coqui_stt_training/util/config.py b/training/coqui_stt_training/util/config.py
index 34794f1b..96114e25 100755
--- a/training/coqui_stt_training/util/config.py
+++ b/training/coqui_stt_training/util/config.py
@@ -477,7 +477,7 @@ class _SttConfig(Coqpit): default=False, 
metadata=dict(help="whether to remove old exported models") ) export_tflite: bool = field( - default=False, metadata=dict(help="export a graph ready for TF Lite engine") + default=True, metadata=dict(help="export a graph ready for TF Lite engine") ) export_quantize: bool = field( default=True, diff --git a/training/coqui_stt_training/util/io.py b/training/coqui_stt_training/util/io.py index a3fb3368..6d466631 100644 --- a/training/coqui_stt_training/util/io.py +++ b/training/coqui_stt_training/util/io.py @@ -90,3 +90,10 @@ def remove_remote(filename): """ # Conditional import return gfile.remove(filename) + + +def rmtree_remote(foldername): + """ + Wrapper that can remove local and remote directories like `gs://...` + """ + return gfile.rmtree(foldername)
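
Usage sketch (not part of the diff above): how a downstream script is expected to drive the split modules, modelled on the bin/run-ldc93s1.py change in this patch. Only the imports and function names come from the patch itself; the file paths and keyword overrides below are illustrative assumptions.

    # hypothetical train_then_test.py, mirroring bin/run-ldc93s1.py
    from coqui_stt_training.util.config import initialize_globals_from_args
    from coqui_stt_training.train import train
    from coqui_stt_training.evaluate import test

    # Populate Config programmatically instead of via CLI flags
    # (field names match the flags used by the CI scripts in this patch;
    # the values are placeholders).
    initialize_globals_from_args(
        alphabet_config_path="data/alphabet.txt",
        train_files=["data/ldc93s1/ldc93s1.csv"],
        test_files=["data/ldc93s1/ldc93s1.csv"],
        checkpoint_dir="/tmp/ckpt",
    )

    train()  # runs training only; no implicit test/export/inference
    test()   # evaluation now lives in coqui_stt_training.evaluate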
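
A second hedged sketch, driving the new export module on its own. export_tflite now defaults to True (see the config.py hunk above), which is why the CI scripts in this patch pass --export_tflite false to keep producing a frozen .pb. Passing these fields as keyword overrides to initialize_globals_from_args is assumed to work the same way as in bin/run-ldc93s1.py; the paths are placeholders.

    # hypothetical export_model.py
    from coqui_stt_training.util.config import initialize_globals_from_args
    from coqui_stt_training.export import export

    initialize_globals_from_args(
        checkpoint_dir="/tmp/ckpt",   # trained checkpoint to restore
        export_dir="/tmp/export",     # destination directory for the export
        export_tflite=False,          # default is now True; False writes a frozen .pb instead of .tflite
    )

    export()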