Merge pull request #1946 from coqui-ai/training-submodules

Split train.py into separate modules
2021-08-25 19:37:53 +02:00 · 2021-08-25 19:37:53 +02:00 · 5afe3c6e59
parent 71da178138 2fd98de56f
commit 5afe3c6e59
12 changed files with 780 additions and 657 deletions
--- a/.github/actions/numpy_vers/action.yml
+++ b/.github/actions/numpy_vers/action.yml
@ -28,15 +28,15 @@ runs:
                case "${{ inputs.pyver }}" in
                    3.7*)
                        NUMPY_BUILD_VERSION="==1.14.5"
-                        NUMPY_DEP_VERSION=">=1.14.5"
+                        NUMPY_DEP_VERSION=">=1.14.5,<=1.19.4"
                    ;;
                    3.8*)
                        NUMPY_BUILD_VERSION="==1.17.3"
-                        NUMPY_DEP_VERSION=">=1.17.3"
+                        NUMPY_DEP_VERSION=">=1.17.3,<=1.19.4"
                    ;;
                    3.9*)
                        NUMPY_BUILD_VERSION="==1.19.4"
-                        NUMPY_DEP_VERSION=">=1.19.4"
+                        NUMPY_DEP_VERSION=">=1.19.4,<=1.19.4"
                    ;;
                esac
            ;;
@ -57,7 +57,7 @@ runs:
                    ;;
                    3.9*)
                        NUMPY_BUILD_VERSION="==1.19.4"
-                        NUMPY_DEP_VERSION=">=1.19.4"
+                        NUMPY_DEP_VERSION=">=1.19.4,<=1.19.4"
                    ;;
                esac
            ;;
@ -82,7 +82,7 @@ runs:
                    ;;
                    3.9*)
                        NUMPY_BUILD_VERSION="==1.19.4"
-                        NUMPY_DEP_VERSION=">=1.19.4"
+                        NUMPY_DEP_VERSION=">=1.19.4,<=1.19.4"
                    ;;
                esac
            ;;
--- a/bin/run-ci-ldc93s1_new.sh
+++ b/bin/run-ci-ldc93s1_new.sh
@ -27,4 +27,5 @@ python -u train.py --alphabet_config_path "data/alphabet.txt" \
  --max_to_keep 1 --checkpoint_dir '/tmp/ckpt' \
  --learning_rate 0.001 --dropout_rate 0.05  --export_dir '/tmp/train' \
  --scorer_path 'data/smoke_test/pruned_lm.scorer' \
-  --audio_sample_rate ${audio_sample_rate}
+  --audio_sample_rate ${audio_sample_rate} \
  --export_tflite false
--- a/bin/run-ci-ldc93s1_new_bytes.sh
+++ b/bin/run-ci-ldc93s1_new_bytes.sh
@ -27,4 +27,5 @@ python -u train.py --show_progressbar false --early_stop false \
  --learning_rate 0.001 --dropout_rate 0.05  --export_dir '/tmp/train_bytes' \
  --scorer_path 'data/smoke_test/pruned_lm.bytes.scorer' \
  --audio_sample_rate ${audio_sample_rate} \
-  --bytes_output_mode true
+  --bytes_output_mode true \
  --export_tflite false
--- a/bin/run-ldc93s1.py
+++ b/bin/run-ldc93s1.py
@ -2,7 +2,8 @@
 import os
 from import_ldc93s1 import _download_and_preprocess_data as download_ldc
 from coqui_stt_training.util.config import initialize_globals_from_args
-from coqui_stt_training.train import train, test
+from coqui_stt_training.train import train
 from coqui_stt_training.evaluate import test
 # only one GPU for only one training sample
 os.environ["CUDA_VISIBLE_DEVICES"] = "0"
@ -21,5 +22,4 @@ initialize_globals_from_args(
 )
 train()
 test()
--- a/setup.py
+++ b/setup.py
@ -18,6 +18,7 @@ def main():
        "coqpit",
        "numpy",
        "optuna",
        "numba <= 0.53.1",
        "opuslib == 2.0.0",
        "pandas",
        "progressbar2",
--- a/training/coqui_stt_training/deepspeech_model.py
+++ b/training/coqui_stt_training/deepspeech_model.py
@ -0,0 +1,403 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 import os
 import sys
 # Scan the raw command line for "--log_level <value>" by hand. Index 0 doubles
 # as the "flag not present" marker, since a flag value can never sit at argv[0].
 LOG_LEVEL_INDEX = sys.argv.index("--log_level") + 1 if "--log_level" in sys.argv else 0
 # Fall back to "3" when the flag is absent or is the last token (no value follows).
 DESIRED_LOG_LEVEL = (
    sys.argv[LOG_LEVEL_INDEX] if 0 < LOG_LEVEL_INDEX < len(sys.argv) else "3"
 )
 # Exported before the tensorflow import below.
 # NOTE(review): presumably TensorFlow's native logging reads this at import time — confirm.
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = DESIRED_LOG_LEVEL
 import numpy as np
 import tensorflow as tf
 import tensorflow.compat.v1 as tfv1
 # Translate the "0".."3" level string into the matching tfv1.logging constant.
 # NOTE(review): any other string makes .get() return None, which is then handed
 # to set_verbosity() — confirm that is acceptable or validate the flag earlier.
 tfv1.logging.set_verbosity(
    {
        "0": tfv1.logging.DEBUG,
        "1": tfv1.logging.INFO,
        "2": tfv1.logging.WARN,
        "3": tfv1.logging.ERROR,
    }.get(DESIRED_LOG_LEVEL)
 )
 from .util.config import Config
 from .util.feeding import audio_to_features
 def variable_on_cpu(name, shape, initializer):
    r"""
    Create (or look up) a variable inside the CPU device scope.

    ``name``, ``shape`` and ``initializer`` are forwarded unchanged to
    ``tfv1.get_variable``; the variable it yields is returned.
    """
    # Everything created inside this scope is assigned to ``Config.cpu_device``.
    with tf.device(Config.cpu_device):
        return tfv1.get_variable(name=name, shape=shape, initializer=initializer)
 def create_overlapping_windows(batch_x):
    """
    Expand a batch of feature frames into overlapping context windows.

    Every window spans ``2 * Config.n_context + 1`` frames of ``Config.n_input``
    features each. The result is reshaped to
    ``[batch_size, n_windows, window_width, n_input]``.
    """
    n_batch = tf.shape(input=batch_x)[0]
    width = 2 * Config.n_context + 1
    channels = Config.n_input
    flat_width = width * channels
    # An identity matrix reshaped into a convolution kernel: the convolution then
    # copies patches of the input through untouched, which is exactly the set of
    # overlapping windows over the MFCCs.
    identity_kernel = tf.constant(
        np.eye(flat_width).reshape(width, channels, flat_width),
        tf.float32,
    )
    # One window per input frame (stride 1, "SAME" padding).
    windows = tf.nn.conv1d(
        input=batch_x, filters=identity_kernel, stride=1, padding="SAME"
    )
    # Split each flattened window back into (window_width, n_input).
    return tf.reshape(windows, [n_batch, -1, width, channels])
 def dense(name, x, units, dropout_rate=None, relu=True, layer_norm=False):
    """
    Build a fully connected layer ``x @ weights + bias`` under variable scope ``name``.

    Optional stages run in this order: clipped ReLU (capped at
    ``Config.relu_clip``) when ``relu`` is true, layer normalisation when
    ``layer_norm`` is true, dropout when ``dropout_rate`` is not ``None``.
    Returns the resulting tensor.
    """
    with tfv1.variable_scope(name):
        bias = variable_on_cpu("bias", [units], tfv1.zeros_initializer())
        kernel_init = tfv1.keras.initializers.VarianceScaling(
            scale=1.0, mode="fan_avg", distribution="uniform"
        )
        weights = variable_on_cpu("weights", [x.shape[-1], units], kernel_init)
    activations = tf.nn.bias_add(tf.matmul(x, weights), bias)
    if relu:
        # ReLU with an upper bound.
        activations = tf.minimum(tf.nn.relu(activations), Config.relu_clip)
    if layer_norm:
        # The normalisation is built under this layer's scope name as well.
        with tfv1.variable_scope(name):
            activations = tf.contrib.layers.layer_norm(activations)
    if dropout_rate is not None:
        activations = tf.nn.dropout(activations, rate=dropout_rate)
    return activations
 def rnn_impl_lstmblockfusedcell(x, seq_length, previous_state, reuse):
    """
    Run ``x`` through one ``LSTMBlockFusedCell`` with ``Config.n_cell_dim`` units.

    ``previous_state`` is used as the initial state and ``seq_length`` as the
    per-sequence lengths. Returns ``(output, output_state)``.
    """
    # Scope and cell name are reproduced verbatim; they decide the variable names.
    with tfv1.variable_scope("cudnn_lstm/rnn/multi_rnn_cell/cell_0"):
        lstm = tf.contrib.rnn.LSTMBlockFusedCell(
            Config.n_cell_dim,
            forget_bias=0,
            reuse=reuse,
            name="cudnn_compatible_lstm_cell",
        )
        rnn_out, rnn_state = lstm(
            inputs=x,
            dtype=tf.float32,
            sequence_length=seq_length,
            initial_state=previous_state,
        )
    return rnn_out, rnn_state
 def rnn_impl_cudnn_rnn(x, seq_length, previous_state, _):
    """
    Run ``x`` through a single-layer, unidirectional ``CudnnLSTM``.

    ``previous_state`` must be ``None``; the fourth argument is ignored.
    Returns ``(output, output_state)``.
    """
    # Passing a previous state is not supported with the CuDNN backend.
    assert previous_state is None
    # Hack: CudnnLSTM behaves like a Keras layer - constructing the object
    # creates the variables, and calling that same object again reuses them.
    # The rest of this code is structured the old TensorFlow way, where calling
    # tf.get_variable again with reuse=True is what shares variables, so the
    # object-oriented API does not slot in directly. Instead a single instance is
    # stored on the function itself, emulating a static function variable.
    if not rnn_impl_cudnn_rnn.cell:
        # Forward direction cell.
        rnn_impl_cudnn_rnn.cell = tf.contrib.cudnn_rnn.CudnnLSTM(
            num_layers=1,
            num_units=Config.n_cell_dim,
            input_mode="linear_input",
            direction="unidirectional",
            dtype=tf.float32,
        )
    shared_cell = rnn_impl_cudnn_rnn.cell
    rnn_out, rnn_state = shared_cell(inputs=x, sequence_lengths=seq_length)
    return rnn_out, rnn_state
 rnn_impl_cudnn_rnn.cell = None
 def rnn_impl_static_rnn(x, seq_length, previous_state, reuse):
    """
    Run ``x`` through an ``LSTMCell`` unrolled with ``tfv1.nn.static_rnn``.

    ``x`` is split along its first axis into one tensor per step; the per-step
    outputs are concatenated along axis 0 again before being returned.
    Returns ``(output, output_state)``.
    """
    with tfv1.variable_scope("cudnn_lstm/rnn/multi_rnn_cell"):
        # Forward direction cell.
        lstm = tfv1.nn.rnn_cell.LSTMCell(
            Config.n_cell_dim,
            forget_bias=0,
            reuse=reuse,
            name="cudnn_compatible_lstm_cell",
        )
        # Turn the rank N tensor into a Python list of rank N-1 tensors.
        time_steps = [x[step] for step in range(x.shape[0])]
        step_outputs, final_state = tfv1.nn.static_rnn(
            cell=lstm,
            inputs=time_steps,
            sequence_length=seq_length,
            initial_state=previous_state,
            dtype=tf.float32,
            scope="cell_0",
        )
        stacked = tf.concat(step_outputs, 0)
    return stacked, final_state
 def create_model(
    batch_x,
    seq_length,
    dropout,
    reuse=False,
    batch_size=None,
    previous_state=None,
    overlap=True,
    rnn_impl=rnn_impl_lstmblockfusedcell,
 ):
    """
    Assemble the acoustic model: three dense layers, one RNN, two dense layers.

    ``dropout`` is indexed at positions 0, 1, 2 and 5. ``rnn_impl`` is invoked as
    ``rnn_impl(x, seq_length, previous_state, reuse)``. A falsy ``batch_size`` is
    replaced by the dynamic first dimension of ``batch_x``; when ``overlap`` is
    true ``batch_x`` first goes through ``create_overlapping_windows``.

    Returns ``(raw_logits, layers)``: ``raw_logits`` is time-major with shape
    ``[n_steps, batch_size, Config.n_hidden_6]`` and ``layers`` maps layer names
    to the tensors built along the way.
    """
    layers = {}
    if not batch_size:
        batch_size = tf.shape(input=batch_x)[0]
    # Create overlapping feature windows if needed.
    if overlap:
        batch_x = create_overlapping_windows(batch_x)
    # Swap the batch and step axes of the rank-4 input, then flatten it to rank 2
    # with rows of width n_input + 2*n_input*n_context for the first dense layer.
    time_major = tf.transpose(a=batch_x, perm=[1, 0, 2, 3])
    flat_features = tf.reshape(
        time_major, [-1, Config.n_input + 2 * Config.n_input * Config.n_context]
    )
    layers["input_reshaped"] = flat_features
    # Three hidden layers with clipped ReLU activation and dropout.
    hidden_1 = dense(
        "layer_1",
        flat_features,
        Config.n_hidden_1,
        dropout_rate=dropout[0],
        layer_norm=Config.layer_norm,
    )
    layers["layer_1"] = hidden_1
    hidden_2 = dense(
        "layer_2",
        hidden_1,
        Config.n_hidden_2,
        dropout_rate=dropout[1],
        layer_norm=Config.layer_norm,
    )
    layers["layer_2"] = hidden_2
    hidden_3 = dense(
        "layer_3",
        hidden_2,
        Config.n_hidden_3,
        dropout_rate=dropout[2],
        layer_norm=Config.layer_norm,
    )
    layers["layer_3"] = hidden_3
    # Back to rank 3, [n_steps, batch_size, n_hidden_3], the layout handed to
    # the RNN. ``layers["layer_3"]`` keeps the flat tensor.
    rnn_input = tf.reshape(hidden_3, [-1, batch_size, Config.n_hidden_3])
    # The RNN implementation is a parameter because training and inference use
    # different ones.
    rnn_out, rnn_state = rnn_impl(rnn_input, seq_length, previous_state, reuse)
    # Flatten the RNN output to [n_steps*batch_size, n_cell_dim].
    rnn_out = tf.reshape(rnn_out, [-1, Config.n_cell_dim])
    layers["rnn_output"] = rnn_out
    layers["rnn_output_state"] = rnn_state
    # Fifth hidden layer, again with clipped ReLU activation.
    hidden_5 = dense(
        "layer_5",
        rnn_out,
        Config.n_hidden_5,
        dropout_rate=dropout[5],
        layer_norm=Config.layer_norm,
    )
    layers["layer_5"] = hidden_5
    # Final linear layer (no ReLU) producing the logits.
    logits_flat = dense("layer_6", hidden_5, Config.n_hidden_6, relu=False)
    layers["layer_6"] = logits_flat
    # Un-flatten to [n_steps, batch_size, n_hidden_6]. Unlike the input, this is
    # time-major.
    raw_logits = tf.reshape(
        logits_flat, [-1, batch_size, Config.n_hidden_6], name="raw_logits"
    )
    layers["raw_logits"] = raw_logits
    return raw_logits, layers
 def create_inference_graph(batch_size=1, n_steps=16, tflite=False):
    """
    Build the inference graph and return ``(inputs, outputs, layers)``.

    Args:
        batch_size: fixed batch size; a value ``<= 0`` selects a dynamic batch
            dimension.
        n_steps: fixed number of steps; a value ``<= 0`` selects a dynamic step
            dimension.
        tflite: when true, use the statically unrolled RNN and squeeze the batch
            axis out of the logits.

    Returns:
        With a fixed batch size, ``inputs`` holds ``input``,
        ``previous_state_c``, ``previous_state_h``, ``input_samples`` (plus
        ``input_lengths`` unless ``Config.export_tflite`` is set) and ``outputs``
        holds ``outputs``, ``new_state_c``, ``new_state_h``, ``mfccs``,
        ``layer_3`` and ``layer_5``. With a dynamic batch size there is no state
        handling: ``inputs`` holds ``input`` and ``input_lengths`` and
        ``outputs`` holds only ``outputs``. ``layers`` is the dict produced by
        ``create_model``.

    Raises:
        NotImplementedError: for a dynamic batch size combined with ``tflite``
            or with a fixed (positive) ``n_steps``.
    """
    # Non-positive means "dynamic", which placeholders express as None.
    batch_size = batch_size if batch_size > 0 else None
    # From here on the dynamic case must be detected with ``is None``: an
    # ordering comparison such as ``None <= 0`` raises TypeError on Python 3.
    dynamic_batch = batch_size is None
    # Create feature computation graph
    # native_client: this node's name and shape are part of the API boundary
    #   with the native client, if you change them you should sync changes with
    #   the C++ code.
    input_samples = tfv1.placeholder(
        tf.float32, [Config.audio_window_samples], "input_samples"
    )
    samples = tf.expand_dims(input_samples, -1)
    mfccs, _ = audio_to_features(samples, Config.audio_sample_rate)
    # native_client: this node's name and shape are part of the API boundary
    #   with the native client, if you change them you should sync changes with
    #   the C++ code.
    mfccs = tf.identity(mfccs, name="mfccs")
    # Input tensor will be of shape [batch_size, n_steps, 2*n_context+1, n_input]
    # This shape is read by the native_client in STT_CreateModel to know the
    # value of n_steps, n_context and n_input. Make sure you update the code
    # there if this shape is changed.
    #
    # native_client: this node's name and shape are part of the API boundary
    #   with the native client, if you change them you should sync changes with
    #   the C++ code.
    input_tensor = tfv1.placeholder(
        tf.float32,
        [
            batch_size,
            n_steps if n_steps > 0 else None,
            2 * Config.n_context + 1,
            Config.n_input,
        ],
        name="input_node",
    )
    # native_client: this node's name and shape are part of the API boundary
    #   with the native client, if you change them you should sync changes with
    #   the C++ code.
    seq_length = tfv1.placeholder(tf.int32, [batch_size], name="input_lengths")
    if dynamic_batch:
        # no state management since n_step is expected to be dynamic too (see below)
        previous_state = None
    else:
        # native_client: this node's name and shape are part of the API boundary
        #   with the native client, if you change them you should sync changes with
        #   the C++ code.
        previous_state_c = tfv1.placeholder(
            tf.float32, [batch_size, Config.n_cell_dim], name="previous_state_c"
        )
        # native_client: this node's name and shape are part of the API boundary
        #   with the native client, if you change them you should sync changes with
        #   the C++ code.
        previous_state_h = tfv1.placeholder(
            tf.float32, [batch_size, Config.n_cell_dim], name="previous_state_h"
        )
        previous_state = tf.nn.rnn_cell.LSTMStateTuple(
            previous_state_c, previous_state_h
        )
    # One rate per layer
    no_dropout = [None] * 6
    if tflite:
        rnn_impl = rnn_impl_static_rnn
    else:
        rnn_impl = rnn_impl_lstmblockfusedcell
    # NOTE(review): sequence lengths are gated on Config.export_tflite rather
    # than on the ``tflite`` argument; left unchanged - confirm the two always agree.
    logits, layers = create_model(
        batch_x=input_tensor,
        batch_size=batch_size,
        seq_length=seq_length if not Config.export_tflite else None,
        dropout=no_dropout,
        previous_state=previous_state,
        overlap=False,
        rnn_impl=rnn_impl,
    )
    # TF Lite runtime will check that input dimensions are 1, 2 or 4
    # by default we get 3, the middle one being batch_size which is forced to
    # one on inference graph, so remove that dimension
    #
    # native_client: this node's name and shape are part of the API boundary
    #   with the native client, if you change them you should sync changes with
    #   the C++ code.
    if tflite:
        logits = tf.squeeze(logits, [1])
    # Apply softmax for CTC decoder
    probs = tf.nn.softmax(logits, name="logits")
    if dynamic_batch:
        if tflite:
            raise NotImplementedError(
                "dynamic batch_size does not support tflite nor streaming"
            )
        if n_steps > 0:
            raise NotImplementedError(
                "dynamic batch_size expect n_steps to be dynamic too"
            )
        # No state placeholders exist on this path, so only the plain
        # input/output tensors are exposed.
        return (
            {
                "input": input_tensor,
                "input_lengths": seq_length,
            },
            {
                "outputs": probs,
            },
            layers,
        )
    new_state_c, new_state_h = layers["rnn_output_state"]
    new_state_c = tf.identity(new_state_c, name="new_state_c")
    new_state_h = tf.identity(new_state_h, name="new_state_h")
    inputs = {
        "input": input_tensor,
        "previous_state_c": previous_state_c,
        "previous_state_h": previous_state_h,
        "input_samples": input_samples,
    }
    if not Config.export_tflite:
        inputs["input_lengths"] = seq_length
    outputs = {
        "outputs": probs,
        "new_state_c": new_state_c,
        "new_state_h": new_state_h,
        "mfccs": mfccs,
        # Expose internal layers for downstream applications
        "layer_3": layers["layer_3"],
        "layer_5": layers["layer_5"],
    }
    return inputs, outputs, layers
--- a/training/coqui_stt_training/evaluate.py
+++ b/training/coqui_stt_training/evaluate.py
@ -13,6 +13,7 @@ from six.moves import zip
 import tensorflow as tf
 from .deepspeech_model import create_model
 from .util.augmentations import NormalizeSampleRate
 from .util.checkpoints import load_graph_for_evaluation
 from .util.config import (
@ -168,25 +169,25 @@ def evaluate(test_csvs, create_model):
        return samples
-def main():
+def test():
-    initialize_globals_from_cli()
+    tfv1.reset_default_graph()
    if not Config.test_files:
        log_error(
            "You need to specify what files to use for evaluation via "
            "the --test_files flag."
        )
        sys.exit(1)
    from .train import (  # pylint: disable=cyclic-import,import-outside-toplevel
        create_model,
    )
    samples = evaluate(Config.test_files, create_model)
    if Config.test_output_file:
        save_samples_json(samples, Config.test_output_file)
 def main():
    """
    Command-line entry point: load the configuration from the CLI and run ``test()``.

    Raises ``RuntimeError`` when ``Config.test_files`` is empty.
    """
    initialize_globals_from_cli()
    if Config.test_files:
        test()
        return
    raise RuntimeError(
        "You need to specify what files to use for evaluation via "
        "the --test_files flag."
    )
 if __name__ == "__main__":
    main()
--- a/training/coqui_stt_training/export.py
+++ b/training/coqui_stt_training/export.py
@ -0,0 +1,216 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 import os
 import sys
 # Scan the raw command line for "--log_level <value>" by hand. Index 0 doubles
 # as the "flag not present" marker, since a flag value can never sit at argv[0].
 LOG_LEVEL_INDEX = sys.argv.index("--log_level") + 1 if "--log_level" in sys.argv else 0
 # Fall back to "3" when the flag is absent or is the last token (no value follows).
 DESIRED_LOG_LEVEL = (
    sys.argv[LOG_LEVEL_INDEX] if 0 < LOG_LEVEL_INDEX < len(sys.argv) else "3"
 )
 # Exported before the tensorflow import below.
 # NOTE(review): presumably TensorFlow's native logging reads this at import time — confirm.
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = DESIRED_LOG_LEVEL
 import tensorflow as tf
 import tensorflow.compat.v1 as tfv1
 import shutil
 from .deepspeech_model import create_inference_graph
 from .util.checkpoints import load_graph_for_evaluation
 from .util.config import Config, initialize_globals_from_cli, log_error, log_info
 from .util.io import (
    open_remote,
    rmtree_remote,
    listdir_remote,
    is_remote_path,
    isdir_remote,
 )
 def file_relative_read(fname):
    """
    Return the full text of ``fname``, resolved relative to this module's directory.
    """
    path = os.path.join(os.path.dirname(__file__), fname)
    # A context manager closes the handle as soon as the read finishes instead
    # of leaving that to the garbage collector.
    with open(path) as fin:
        return fin.read()
 def export():
    r"""
    Restores the trained variables into a simpler graph that will be exported for serving.

    Builds the inference graph, attaches ``metadata_*`` constants to its outputs,
    freezes the variables restored from the checkpoint, and writes either a
    ``.pb`` file or, when ``Config.export_tflite`` is set, a ``.tflite`` file
    into ``Config.export_dir``. A ``.md`` metadata file named after the author
    id, model name and model version is written to the same directory.
    """
    log_info("Exporting the model...")
    # Start from an empty default graph so only inference nodes end up in it.
    tfv1.reset_default_graph()
    inputs, outputs, _ = create_inference_graph(
        batch_size=Config.export_batch_size,
        n_steps=Config.n_steps,
        tflite=Config.export_tflite,
    )
    # The graph version is read from the GRAPH_VERSION file next to this module.
    graph_version = int(file_relative_read("GRAPH_VERSION").strip())
    assert graph_version > 0
    # native_client: these nodes' names and shapes are part of the API boundary
    #   with the native client, if you change them you should sync changes with
    #   the C++ code.
    outputs["metadata_version"] = tf.constant([graph_version], name="metadata_version")
    outputs["metadata_sample_rate"] = tf.constant(
        [Config.audio_sample_rate], name="metadata_sample_rate"
    )
    outputs["metadata_feature_win_len"] = tf.constant(
        [Config.feature_win_len], name="metadata_feature_win_len"
    )
    outputs["metadata_feature_win_step"] = tf.constant(
        [Config.feature_win_step], name="metadata_feature_win_step"
    )
    outputs["metadata_beam_width"] = tf.constant(
        [Config.export_beam_width], name="metadata_beam_width"
    )
    outputs["metadata_alphabet"] = tf.constant(
        [Config.alphabet.Serialize()], name="metadata_alphabet"
    )
    # The language constant is only added when a language was configured.
    if Config.export_language:
        outputs["metadata_language"] = tf.constant(
            [Config.export_language.encode("utf-8")], name="metadata_language"
        )
    # Prevent further graph changes
    tfv1.get_default_graph().finalize()
    # Collect the node names to keep: tensors contribute the name of the op
    # producing them, bare operations contribute their own name.
    output_names_tensors = [
        tensor.op.name for tensor in outputs.values() if isinstance(tensor, tf.Tensor)
    ]
    output_names_ops = [
        op.name for op in outputs.values() if isinstance(op, tf.Operation)
    ]
    output_names = output_names_tensors + output_names_ops
    with tf.Session() as session:
        # Restore variables from checkpoint
        load_graph_for_evaluation(session)
        output_filename = Config.export_file_name + ".pb"
        # Optionally wipe a previous export before writing the new one.
        if Config.remove_export:
            if isdir_remote(Config.export_dir):
                log_info("Removing old export")
                rmtree_remote(Config.export_dir)
        output_graph_path = os.path.join(Config.export_dir, output_filename)
        # Only local export directories are created here.
        if not is_remote_path(Config.export_dir) and not os.path.isdir(
            Config.export_dir
        ):
            os.makedirs(Config.export_dir)
        # Freeze: turn variables into constants, then keep only the sub-graph
        # needed to compute the collected output nodes.
        frozen_graph = tfv1.graph_util.convert_variables_to_constants(
            sess=session,
            input_graph_def=tfv1.get_default_graph().as_graph_def(),
            output_node_names=output_names,
        )
        frozen_graph = tfv1.graph_util.extract_sub_graph(
            graph_def=frozen_graph, dest_nodes=output_names
        )
        if not Config.export_tflite:
            # Plain TensorFlow export: serialise the frozen GraphDef as-is.
            with open_remote(output_graph_path, "wb") as fout:
                fout.write(frozen_graph.SerializeToString())
        else:
            # TFLite export: same base file name with a .tflite extension.
            output_tflite_path = os.path.join(
                Config.export_dir, output_filename.replace(".pb", ".tflite")
            )
            converter = tf.lite.TFLiteConverter(
                frozen_graph,
                input_tensors=inputs.values(),
                output_tensors=outputs.values(),
            )
            if Config.export_quantize:
                converter.optimizations = [tf.lite.Optimize.DEFAULT]
            # AudioSpectrogram and Mfcc ops are custom but have built-in kernels in TFLite
            converter.allow_custom_ops = True
            tflite_model = converter.convert()
            with open_remote(output_tflite_path, "wb") as fout:
                fout.write(tflite_model)
        log_info("Models exported at %s" % (Config.export_dir))
    # Write the metadata file: a key/value header delimited by "---" lines,
    # followed by the free-form description.
    metadata_fname = os.path.join(
        Config.export_dir,
        "{}_{}_{}.md".format(
            Config.export_author_id,
            Config.export_model_name,
            Config.export_model_version,
        ),
    )
    model_runtime = "tflite" if Config.export_tflite else "tensorflow"
    with open_remote(metadata_fname, "w") as f:
        f.write("---\n")
        f.write("author: {}\n".format(Config.export_author_id))
        f.write("model_name: {}\n".format(Config.export_model_name))
        f.write("model_version: {}\n".format(Config.export_model_version))
        f.write("contact_info: {}\n".format(Config.export_contact_info))
        f.write("license: {}\n".format(Config.export_license))
        f.write("language: {}\n".format(Config.export_language))
        f.write("runtime: {}\n".format(model_runtime))
        f.write("min_stt_version: {}\n".format(Config.export_min_stt_version))
        f.write("max_stt_version: {}\n".format(Config.export_max_stt_version))
        # The two URL fields are placeholders for the publisher to fill in by hand.
        f.write(
            "acoustic_model_url: <replace this with a publicly available URL of the acoustic model>\n"
        )
        f.write(
            "scorer_url: <replace this with a publicly available URL of the scorer, if present>\n"
        )
        f.write("---\n")
        f.write("{}\n".format(Config.export_description))
    log_info(
        "Model metadata file saved to {}. Before submitting the exported model for publishing make sure all information in the metadata file is correct, and complete the URL fields.".format(
            metadata_fname
        )
    )
 def package_zip():
    """
    Bundle the export directory (plus the scorer, when one is configured) into a zip.

    ``--export_dir path/to/export/LANG_CODE/`` produces
    ``path/to/export/LANG_CODE.zip``. Remote export directories are not
    supported: an error is logged and nothing is packaged.
    """
    # Test the path exactly as configured. os.path.abspath() prefixes a
    # URL-style location with the working directory and collapses its "//", so
    # a check made on the absolute path may no longer recognise it as remote.
    if is_remote_path(Config.export_dir):
        log_error(
            "Cannot package remote path zip %s. Please do this manually."
            % Config.export_dir
        )
        return
    # --export_dir path/to/export/LANG_CODE/ => path/to/export/LANG_CODE.zip
    export_dir = os.path.join(
        os.path.abspath(Config.export_dir), ""
    )  # Force ending '/'
    zip_filename = os.path.dirname(export_dir)
    # Copying an unset scorer path would raise after the model has already been
    # exported, so the copy only happens when a scorer is configured.
    if Config.scorer_path:
        shutil.copy(Config.scorer_path, export_dir)
    else:
        log_info("No scorer configured, packaging the model without one")
    archive = shutil.make_archive(zip_filename, "zip", export_dir)
    log_info("Exported packaged model {}".format(archive))
 def main(_=None):
    """
    Command-line entry point: export the model, optionally packaging it as a zip.

    The single positional argument is ignored. It defaults to ``None`` so the
    bare ``main()`` call in the ``__main__`` guard below works, while callers
    that still pass one argument keep working.

    Raises:
        RuntimeError: when ``Config.export_dir`` is not set, or when a zip is
            requested and the export directory already has content.
    """
    initialize_globals_from_cli()
    if not Config.export_dir:
        raise RuntimeError(
            "Calling export script directly but no --export_dir specified"
        )
    if not Config.export_zip:
        # Export to folder
        export()
    else:
        # The zip is built from the whole directory, so it has to start out empty.
        if listdir_remote(Config.export_dir):
            raise RuntimeError(
                "Directory {} is not empty, please fix this.".format(Config.export_dir)
            )
        export()
        package_zip()
 if __name__ == "__main__":
    main()
--- a/training/coqui_stt_training/train.py
+++ b/training/coqui_stt_training/train.py
@ -14,12 +14,13 @@ os.environ["TF_CPP_MIN_LOG_LEVEL"] = DESIRED_LOG_LEVEL
 import json
 import shutil
 import time
 from datetime import datetime
 import numpy as np
 import progressbar
 import tensorflow.compat.v1 as tfv1
 import tensorflow as tf
 from coqui_stt_ctcdecoder import Scorer
 tfv1.logging.set_verbosity(
    {
@ -30,12 +31,15 @@ tfv1.logging.set_verbosity(
    }.get(DESIRED_LOG_LEVEL)
 )
 from datetime import datetime
-from coqui_stt_ctcdecoder import Scorer, ctc_beam_search_decoder
+from . import evaluate
-from six.moves import range, zip
+from . import export
-
+from . import training_graph_inference
-from .evaluate import evaluate
+from .deepspeech_model import (
    create_model,
    rnn_impl_lstmblockfusedcell,
    rnn_impl_cudnn_rnn,
 )
 from .util.augmentations import NormalizeSampleRate
 from .util.checkpoints import (
    load_graph_for_evaluation,
@ -52,260 +56,16 @@ from .util.config import (
    log_progress,
    log_warn,
 )
-from .util.evaluate_tools import save_samples_json
+from .util.feeding import create_dataset
 from .util.feeding import audio_to_features, audiofile_to_features, create_dataset
 from .util.helpers import ExceptionBox, check_ctcdecoder_version
 from .util.io import (
    is_remote_path,
    isdir_remote,
    listdir_remote,
    open_remote,
    remove_remote,
 )
 check_ctcdecoder_version()
 # Graph Creation
 # ==============
 def variable_on_cpu(name, shape, initializer):
    r"""
    Create (or look up) a variable inside the CPU device scope.

    ``name``, ``shape`` and ``initializer`` are forwarded unchanged to
    ``tfv1.get_variable``; the variable it yields is returned.
    """
    # Everything created inside this scope is assigned to ``Config.cpu_device``.
    with tf.device(Config.cpu_device):
        return tfv1.get_variable(name=name, shape=shape, initializer=initializer)
 def create_overlapping_windows(batch_x):
    """
    Expand a batch of feature frames into overlapping context windows.

    Every window spans ``2 * Config.n_context + 1`` frames of ``Config.n_input``
    features each. The result is reshaped to
    ``[batch_size, n_windows, window_width, n_input]``.
    """
    n_batch = tf.shape(input=batch_x)[0]
    width = 2 * Config.n_context + 1
    channels = Config.n_input
    flat_width = width * channels
    # An identity matrix reshaped into a convolution kernel: the convolution then
    # copies patches of the input through untouched, which is exactly the set of
    # overlapping windows over the MFCCs.
    identity_kernel = tf.constant(
        np.eye(flat_width).reshape(width, channels, flat_width),
        tf.float32,
    )
    # One window per input frame (stride 1, "SAME" padding).
    windows = tf.nn.conv1d(
        input=batch_x, filters=identity_kernel, stride=1, padding="SAME"
    )
    # Split each flattened window back into (window_width, n_input).
    return tf.reshape(windows, [n_batch, -1, width, channels])
 def dense(name, x, units, dropout_rate=None, relu=True, layer_norm=False):
    """
    Build a fully connected layer ``x @ weights + bias`` under variable scope ``name``.

    Optional stages run in this order: clipped ReLU (capped at
    ``Config.relu_clip``) when ``relu`` is true, layer normalisation when
    ``layer_norm`` is true, dropout when ``dropout_rate`` is not ``None``.
    Returns the resulting tensor.
    """
    with tfv1.variable_scope(name):
        bias = variable_on_cpu("bias", [units], tfv1.zeros_initializer())
        kernel_init = tfv1.keras.initializers.VarianceScaling(
            scale=1.0, mode="fan_avg", distribution="uniform"
        )
        weights = variable_on_cpu("weights", [x.shape[-1], units], kernel_init)
    activations = tf.nn.bias_add(tf.matmul(x, weights), bias)
    if relu:
        # ReLU with an upper bound.
        activations = tf.minimum(tf.nn.relu(activations), Config.relu_clip)
    if layer_norm:
        # The normalisation is built under this layer's scope name as well.
        with tfv1.variable_scope(name):
            activations = tf.contrib.layers.layer_norm(activations)
    if dropout_rate is not None:
        activations = tf.nn.dropout(activations, rate=dropout_rate)
    return activations
 def rnn_impl_lstmblockfusedcell(x, seq_length, previous_state, reuse):
    """
    Run ``x`` through one ``LSTMBlockFusedCell`` with ``Config.n_cell_dim`` units.

    ``previous_state`` is used as the initial state and ``seq_length`` as the
    per-sequence lengths. Returns ``(output, output_state)``.
    """
    # Scope and cell name are reproduced verbatim; they decide the variable names.
    with tfv1.variable_scope("cudnn_lstm/rnn/multi_rnn_cell/cell_0"):
        lstm = tf.contrib.rnn.LSTMBlockFusedCell(
            Config.n_cell_dim,
            forget_bias=0,
            reuse=reuse,
            name="cudnn_compatible_lstm_cell",
        )
        rnn_out, rnn_state = lstm(
            inputs=x,
            dtype=tf.float32,
            sequence_length=seq_length,
            initial_state=previous_state,
        )
    return rnn_out, rnn_state
 def rnn_impl_cudnn_rnn(x, seq_length, previous_state, _):
    """
    Run ``x`` through a single-layer, unidirectional ``CudnnLSTM``.

    ``previous_state`` must be ``None``; the fourth argument is ignored.
    Returns ``(output, output_state)``.
    """
    # Passing a previous state is not supported with the CuDNN backend.
    assert previous_state is None
    # Hack: CudnnLSTM behaves like a Keras layer - constructing the object
    # creates the variables, and calling that same object again reuses them.
    # The rest of this code is structured the old TensorFlow way, where calling
    # tf.get_variable again with reuse=True is what shares variables, so the
    # object-oriented API does not slot in directly. Instead a single instance is
    # stored on the function itself, emulating a static function variable.
    if not rnn_impl_cudnn_rnn.cell:
        # Forward direction cell.
        rnn_impl_cudnn_rnn.cell = tf.contrib.cudnn_rnn.CudnnLSTM(
            num_layers=1,
            num_units=Config.n_cell_dim,
            input_mode="linear_input",
            direction="unidirectional",
            dtype=tf.float32,
        )
    shared_cell = rnn_impl_cudnn_rnn.cell
    rnn_out, rnn_state = shared_cell(inputs=x, sequence_lengths=seq_length)
    return rnn_out, rnn_state
 rnn_impl_cudnn_rnn.cell = None
 def rnn_impl_static_rnn(x, seq_length, previous_state, reuse):
    """
    Run ``x`` through an ``LSTMCell`` unrolled with ``tfv1.nn.static_rnn``.

    ``x`` is split along its first axis into one tensor per step; the per-step
    outputs are concatenated along axis 0 again before being returned.
    Returns ``(output, output_state)``.
    """
    with tfv1.variable_scope("cudnn_lstm/rnn/multi_rnn_cell"):
        # Forward direction cell.
        lstm = tfv1.nn.rnn_cell.LSTMCell(
            Config.n_cell_dim,
            forget_bias=0,
            reuse=reuse,
            name="cudnn_compatible_lstm_cell",
        )
        # Turn the rank N tensor into a Python list of rank N-1 tensors.
        time_steps = [x[step] for step in range(x.shape[0])]
        step_outputs, final_state = tfv1.nn.static_rnn(
            cell=lstm,
            inputs=time_steps,
            sequence_length=seq_length,
            initial_state=previous_state,
            dtype=tf.float32,
            scope="cell_0",
        )
        stacked = tf.concat(step_outputs, 0)
    return stacked, final_state
 def create_model(
    batch_x,
    seq_length,
    dropout,
    reuse=False,
    batch_size=None,
    previous_state=None,
    overlap=True,
    rnn_impl=rnn_impl_lstmblockfusedcell,
 ):
    """
    Assemble the acoustic model: three dense layers, one RNN, two dense layers.

    ``dropout`` is indexed at positions 0, 1, 2 and 5. ``rnn_impl`` is invoked as
    ``rnn_impl(x, seq_length, previous_state, reuse)``. A falsy ``batch_size`` is
    replaced by the dynamic first dimension of ``batch_x``; when ``overlap`` is
    true ``batch_x`` first goes through ``create_overlapping_windows``.

    Returns ``(raw_logits, layers)``: ``raw_logits`` is time-major with shape
    ``[n_steps, batch_size, Config.n_hidden_6]`` and ``layers`` maps layer names
    to the tensors built along the way.
    """
    layers = {}
    if not batch_size:
        batch_size = tf.shape(input=batch_x)[0]
    # Create overlapping feature windows if needed.
    if overlap:
        batch_x = create_overlapping_windows(batch_x)
    # Swap the batch and step axes of the rank-4 input, then flatten it to rank 2
    # with rows of width n_input + 2*n_input*n_context for the first dense layer.
    time_major = tf.transpose(a=batch_x, perm=[1, 0, 2, 3])
    flat_features = tf.reshape(
        time_major, [-1, Config.n_input + 2 * Config.n_input * Config.n_context]
    )
    layers["input_reshaped"] = flat_features
    # Three hidden layers with clipped ReLU activation and dropout.
    hidden_1 = dense(
        "layer_1",
        flat_features,
        Config.n_hidden_1,
        dropout_rate=dropout[0],
        layer_norm=Config.layer_norm,
    )
    layers["layer_1"] = hidden_1
    hidden_2 = dense(
        "layer_2",
        hidden_1,
        Config.n_hidden_2,
        dropout_rate=dropout[1],
        layer_norm=Config.layer_norm,
    )
    layers["layer_2"] = hidden_2
    hidden_3 = dense(
        "layer_3",
        hidden_2,
        Config.n_hidden_3,
        dropout_rate=dropout[2],
        layer_norm=Config.layer_norm,
    )
    layers["layer_3"] = hidden_3
    # Back to rank 3, [n_steps, batch_size, n_hidden_3], the layout handed to
    # the RNN. ``layers["layer_3"]`` keeps the flat tensor.
    rnn_input = tf.reshape(hidden_3, [-1, batch_size, Config.n_hidden_3])
    # The RNN implementation is a parameter because training and inference use
    # different ones.
    rnn_out, rnn_state = rnn_impl(rnn_input, seq_length, previous_state, reuse)
    # Flatten the RNN output to [n_steps*batch_size, n_cell_dim].
    rnn_out = tf.reshape(rnn_out, [-1, Config.n_cell_dim])
    layers["rnn_output"] = rnn_out
    layers["rnn_output_state"] = rnn_state
    # Fifth hidden layer, again with clipped ReLU activation.
    hidden_5 = dense(
        "layer_5",
        rnn_out,
        Config.n_hidden_5,
        dropout_rate=dropout[5],
        layer_norm=Config.layer_norm,
    )
    layers["layer_5"] = hidden_5
    # Final linear layer (no ReLU) producing the logits.
    logits_flat = dense("layer_6", hidden_5, Config.n_hidden_6, relu=False)
    layers["layer_6"] = logits_flat
    # Un-flatten to [n_steps, batch_size, n_hidden_6]. Unlike the input, this is
    # time-major.
    raw_logits = tf.reshape(
        logits_flat, [-1, batch_size, Config.n_hidden_6], name="raw_logits"
    )
    layers["raw_logits"] = raw_logits
    return raw_logits, layers
 # Accuracy and Loss
 # =================
@ -900,371 +660,6 @@ def train():
    log_debug("Session closed.")
 def test():
    """
    Evaluate the model on `Config.test_files` in a fresh default graph.

    The samples returned by `evaluate` are written with `save_samples_json`
    only when `Config.test_output_file` is set.
    """
    tfv1.reset_default_graph()
    evaluated_samples = evaluate(Config.test_files, create_model)
    if not Config.test_output_file:
        return
    save_samples_json(evaluated_samples, Config.test_output_file)
 def create_inference_graph(batch_size=1, n_steps=16, tflite=False):
    """
    Build the graph used for inference and export.

    Args:
        batch_size: fixed batch size of the graph inputs. A value <= 0 (or
            None) requests a dynamic batch dimension; in that mode no LSTM
            state placeholders are created.
        n_steps: fixed number of time steps of the input placeholder. A value
            <= 0 leaves that dimension dynamic.
        tflite: build the graph for TF Lite: use `rnn_impl_static_rnn`, do not
            feed sequence lengths to the model and squeeze the batch dimension
            out of the logits.

    Returns:
        A tuple `(inputs, outputs, layers)` where `inputs` and `outputs` are
        dicts of named tensors and `layers` is the dict returned by
        `create_model`.

    Raises:
        NotImplementedError: if a dynamic batch size is combined with `tflite`
            or with a fixed (`> 0`) `n_steps`.
    """
    # Decide once whether the batch dimension is dynamic. The batch size is
    # replaced by None below, so later checks must not compare it with 0
    # (`None <= 0` raises TypeError on Python 3).
    dynamic_batch = batch_size is None or batch_size <= 0
    if dynamic_batch:
        # Fail fast, before any node is added to the graph.
        if tflite:
            raise NotImplementedError(
                "dynamic batch_size does not support tflite nor streaming"
            )
        if n_steps > 0:
            raise NotImplementedError(
                "dynamic batch_size expect n_steps to be dynamic too"
            )
        batch_size = None
    # Create feature computation graph
    # native_client: this node's name and shape are part of the API boundary
    #   with the native client, if you change them you should sync changes with
    #   the C++ code.
    input_samples = tfv1.placeholder(
        tf.float32, [Config.audio_window_samples], "input_samples"
    )
    samples = tf.expand_dims(input_samples, -1)
    mfccs, _ = audio_to_features(samples, Config.audio_sample_rate)
    # native_client: this node's name and shape are part of the API boundary
    #   with the native client, if you change them you should sync changes with
    #   the C++ code.
    mfccs = tf.identity(mfccs, name="mfccs")
    # Input tensor will be of shape [batch_size, n_steps, 2*n_context+1, n_input]
    # This shape is read by the native_client in STT_CreateModel to know the
    # value of n_steps, n_context and n_input. Make sure you update the code
    # there if this shape is changed.
    #
    # native_client: this node's name and shape are part of the API boundary
    #   with the native client, if you change them you should sync changes with
    #   the C++ code.
    input_tensor = tfv1.placeholder(
        tf.float32,
        [
            batch_size,
            n_steps if n_steps > 0 else None,
            2 * Config.n_context + 1,
            Config.n_input,
        ],
        name="input_node",
    )
    # native_client: this node's name and shape are part of the API boundary
    #   with the native client, if you change them you should sync changes with
    #   the C++ code.
    seq_length = tfv1.placeholder(tf.int32, [batch_size], name="input_lengths")
    if dynamic_batch:
        # no state management since n_step is expected to be dynamic too
        previous_state = None
    else:
        # native_client: this node's name and shape are part of the API boundary
        #   with the native client, if you change them you should sync changes with
        #   the C++ code.
        previous_state_c = tfv1.placeholder(
            tf.float32, [batch_size, Config.n_cell_dim], name="previous_state_c"
        )
        # native_client: this node's name and shape are part of the API boundary
        #   with the native client, if you change them you should sync changes with
        #   the C++ code.
        previous_state_h = tfv1.placeholder(
            tf.float32, [batch_size, Config.n_cell_dim], name="previous_state_h"
        )
        previous_state = tf.nn.rnn_cell.LSTMStateTuple(
            previous_state_c, previous_state_h
        )
    # One rate per layer
    no_dropout = [None] * 6
    rnn_impl = rnn_impl_static_rnn if tflite else rnn_impl_lstmblockfusedcell
    # The `tflite` argument (not the global export flag) decides whether the
    # sequence lengths are part of the graph, so callers that request a
    # non-TFLite graph always get a usable `input_lengths` input.
    logits, layers = create_model(
        batch_x=input_tensor,
        batch_size=batch_size,
        seq_length=None if tflite else seq_length,
        dropout=no_dropout,
        previous_state=previous_state,
        overlap=False,
        rnn_impl=rnn_impl,
    )
    # TF Lite runtime will check that input dimensions are 1, 2 or 4
    # by default we get 3, the middle one being batch_size which is forced to
    # one on inference graph, so remove that dimension
    #
    # native_client: this node's name and shape are part of the API boundary
    #   with the native client, if you change them you should sync changes with
    #   the C++ code.
    if tflite:
        logits = tf.squeeze(logits, [1])
    # Apply softmax for CTC decoder
    probs = tf.nn.softmax(logits, name="logits")
    if dynamic_batch:
        # Stateless graph: only the features and their lengths go in.
        return (
            {
                "input": input_tensor,
                "input_lengths": seq_length,
            },
            {
                "outputs": probs,
            },
            layers,
        )
    new_state_c, new_state_h = layers["rnn_output_state"]
    new_state_c = tf.identity(new_state_c, name="new_state_c")
    new_state_h = tf.identity(new_state_h, name="new_state_h")
    inputs = {
        "input": input_tensor,
        "previous_state_c": previous_state_c,
        "previous_state_h": previous_state_h,
        "input_samples": input_samples,
    }
    if not tflite:
        inputs["input_lengths"] = seq_length
    outputs = {
        "outputs": probs,
        "new_state_c": new_state_c,
        "new_state_h": new_state_h,
        "mfccs": mfccs,
        # Expose internal layers for downstream applications
        "layer_3": layers["layer_3"],
        "layer_5": layers["layer_5"],
    }
    return inputs, outputs, layers
 def file_relative_read(fname):
    """
    Return the full text of `fname`, resolved relative to this module's directory.

    The file is opened in text mode and closed before returning.
    """
    path = os.path.join(os.path.dirname(__file__), fname)
    # Use a context manager so the handle is released deterministically
    # instead of waiting for garbage collection.
    with open(path) as fin:
        return fin.read()
 def export():
    r"""
    Restores the trained variables into a simpler graph that will be exported for serving.

    Builds the inference graph from the `Config.export_*` settings, adds
    metadata constants as extra outputs, freezes the variables restored from
    the checkpoint into constants and writes either a `.pb` protobuf or a
    `.tflite` model (when `Config.export_tflite` is set) into
    `Config.export_dir`. A Markdown metadata file describing the model is
    written next to it.
    """
    log_info("Exporting the model...")
    tfv1.reset_default_graph()
    inputs, outputs, _ = create_inference_graph(
        batch_size=Config.export_batch_size,
        n_steps=Config.n_steps,
        tflite=Config.export_tflite,
    )
    # The graph version is read from the GRAPH_VERSION file that sits next to
    # this module.
    graph_version = int(file_relative_read("GRAPH_VERSION").strip())
    assert graph_version > 0
    # native_client: these nodes's names and shapes are part of the API boundary
    #   with the native client, if you change them you should sync changes with
    #   the C++ code.
    outputs["metadata_version"] = tf.constant([graph_version], name="metadata_version")
    outputs["metadata_sample_rate"] = tf.constant(
        [Config.audio_sample_rate], name="metadata_sample_rate"
    )
    outputs["metadata_feature_win_len"] = tf.constant(
        [Config.feature_win_len], name="metadata_feature_win_len"
    )
    outputs["metadata_feature_win_step"] = tf.constant(
        [Config.feature_win_step], name="metadata_feature_win_step"
    )
    outputs["metadata_beam_width"] = tf.constant(
        [Config.export_beam_width], name="metadata_beam_width"
    )
    outputs["metadata_alphabet"] = tf.constant(
        [Config.alphabet.Serialize()], name="metadata_alphabet"
    )
    # The language constant is optional and only added when a language is configured.
    if Config.export_language:
        outputs["metadata_language"] = tf.constant(
            [Config.export_language.encode("utf-8")], name="metadata_language"
        )
    # Prevent further graph changes
    tfv1.get_default_graph().finalize()
    # Collect the node names of every output, whether it is stored as a
    # tensor or as an operation; these are the roots kept when freezing.
    output_names_tensors = [
        tensor.op.name for tensor in outputs.values() if isinstance(tensor, tf.Tensor)
    ]
    output_names_ops = [
        op.name for op in outputs.values() if isinstance(op, tf.Operation)
    ]
    output_names = output_names_tensors + output_names_ops
    with tf.Session() as session:
        # Restore variables from checkpoint
        load_graph_for_evaluation(session)
        output_filename = Config.export_file_name + ".pb"
        if Config.remove_export:
            if isdir_remote(Config.export_dir):
                log_info("Removing old export")
                # NOTE(review): export_dir was just confirmed to be a directory
                # but is removed with remove_remote — confirm that helper
                # handles directories (a recursive helper may be needed).
                remove_remote(Config.export_dir)
        output_graph_path = os.path.join(Config.export_dir, output_filename)
        # Only local export directories are created here.
        if not is_remote_path(Config.export_dir) and not os.path.isdir(
            Config.export_dir
        ):
            os.makedirs(Config.export_dir)
        # Turn the restored variables into constants, then drop every node
        # that is not needed to compute the requested outputs.
        frozen_graph = tfv1.graph_util.convert_variables_to_constants(
            sess=session,
            input_graph_def=tfv1.get_default_graph().as_graph_def(),
            output_node_names=output_names,
        )
        frozen_graph = tfv1.graph_util.extract_sub_graph(
            graph_def=frozen_graph, dest_nodes=output_names
        )
        if not Config.export_tflite:
            # Plain TensorFlow export: serialize the frozen GraphDef as-is.
            with open_remote(output_graph_path, "wb") as fout:
                fout.write(frozen_graph.SerializeToString())
        else:
            # TF Lite export: same base file name with a .tflite extension.
            output_tflite_path = os.path.join(
                Config.export_dir, output_filename.replace(".pb", ".tflite")
            )
            converter = tf.lite.TFLiteConverter(
                frozen_graph,
                input_tensors=inputs.values(),
                output_tensors=outputs.values(),
            )
            if Config.export_quantize:
                converter.optimizations = [tf.lite.Optimize.DEFAULT]
            # AudioSpectrogram and Mfcc ops are custom but have built-in kernels in TFLite
            converter.allow_custom_ops = True
            tflite_model = converter.convert()
            with open_remote(output_tflite_path, "wb") as fout:
                fout.write(tflite_model)
        log_info("Models exported at %s" % (Config.export_dir))
    # The metadata file is named <author_id>_<model_name>_<model_version>.md
    metadata_fname = os.path.join(
        Config.export_dir,
        "{}_{}_{}.md".format(
            Config.export_author_id,
            Config.export_model_name,
            Config.export_model_version,
        ),
    )
    model_runtime = "tflite" if Config.export_tflite else "tensorflow"
    # Write a header delimited by "---" lines, followed by the free-form
    # description. The two URL fields are placeholders for the publisher.
    with open_remote(metadata_fname, "w") as f:
        f.write("---\n")
        f.write("author: {}\n".format(Config.export_author_id))
        f.write("model_name: {}\n".format(Config.export_model_name))
        f.write("model_version: {}\n".format(Config.export_model_version))
        f.write("contact_info: {}\n".format(Config.export_contact_info))
        f.write("license: {}\n".format(Config.export_license))
        f.write("language: {}\n".format(Config.export_language))
        f.write("runtime: {}\n".format(model_runtime))
        f.write("min_stt_version: {}\n".format(Config.export_min_stt_version))
        f.write("max_stt_version: {}\n".format(Config.export_max_stt_version))
        f.write(
            "acoustic_model_url: <replace this with a publicly available URL of the acoustic model>\n"
        )
        f.write(
            "scorer_url: <replace this with a publicly available URL of the scorer, if present>\n"
        )
        f.write("---\n")
        f.write("{}\n".format(Config.export_description))
    log_info(
        "Model metadata file saved to {}. Before submitting the exported model for publishing make sure all information in the metadata file is correct, and complete the URL fields.".format(
            metadata_fname
        )
    )
 def package_zip():
    """
    Bundle the export directory and the scorer into a zip archive.

    `--export_dir path/to/export/LANG_CODE/` produces
    `path/to/export/LANG_CODE.zip`. The scorer at `Config.scorer_path` is
    copied into the export directory before archiving. Remote export
    directories are not supported: an error is logged and nothing is done.
    """
    # Test the raw flag value. os.path.abspath() would rewrite a URL-style
    # path such as "gs://bucket/dir" into "<cwd>/gs:/bucket/dir", so a check
    # made after normalisation could no longer see the scheme.
    # NOTE(review): assumes is_remote_path matches on a URL scheme prefix — confirm.
    if is_remote_path(Config.export_dir):
        log_error(
            "Cannot package remote path zip %s. Please do this manually."
            % Config.export_dir
        )
        return
    export_dir = os.path.join(
        os.path.abspath(Config.export_dir), ""
    )  # Force ending '/'
    # With the trailing separator in place, dirname() yields the directory
    # path itself, which becomes the archive base name.
    zip_filename = os.path.dirname(export_dir)
    shutil.copy(Config.scorer_path, export_dir)
    archive = shutil.make_archive(zip_filename, "zip", export_dir)
    log_info("Exported packaged model {}".format(archive))
 def do_single_file_inference(input_file_path):
    """
    Transcribe a single audio file with the training checkpoint and print the result.

    A batch-size-1 inference graph with a dynamic number of steps is built in
    a fresh default graph, the checkpoint variables are restored, the file's
    features are run through the model starting from an all-zero LSTM state,
    and the first entry returned by the CTC beam search decoder is printed.
    """
    tfv1.reset_default_graph()
    with tfv1.Session(config=Config.session_config) as session:
        graph_inputs, graph_outputs, _ = create_inference_graph(
            batch_size=1, n_steps=-1
        )
        # Restore variables from training checkpoint
        load_graph_for_evaluation(session)
        feature_op, length_op = audiofile_to_features(input_file_path)
        # Inference starts from an all-zero LSTM state
        zero_state_c = np.zeros([1, Config.n_cell_dim])
        zero_state_h = np.zeros([1, Config.n_cell_dim])
        # Add batch dimension
        batched_features = tf.expand_dims(feature_op, 0)
        batched_lengths = tf.expand_dims(length_op, 0)
        # Materialise the graph tensors as concrete arrays
        windowed = create_overlapping_windows(batched_features).eval(session=session)
        lengths = batched_lengths.eval(session=session)
        feed = {
            graph_inputs["input"]: windowed,
            graph_inputs["input_lengths"]: lengths,
            graph_inputs["previous_state_c"]: zero_state_c,
            graph_inputs["previous_state_h"]: zero_state_h,
        }
        probs = np.squeeze(
            graph_outputs["outputs"].eval(feed_dict=feed, session=session)
        )
        # The external scorer is optional
        scorer = (
            Scorer(Config.lm_alpha, Config.lm_beta, Config.scorer_path, Config.alphabet)
            if Config.scorer_path
            else None
        )
        decoded = ctc_beam_search_decoder(
            probs,
            Config.alphabet,
            Config.beam_width,
            scorer=scorer,
            cutoff_prob=Config.cutoff_prob,
            cutoff_top_n=Config.cutoff_top_n,
        )
        # Print highest probability result
        print(decoded[0][1])
 def early_training_checks():
    # Check for proper scorer early
    if Config.scorer_path:
@ -1289,36 +684,47 @@ def early_training_checks():
        )
    if not Config.alphabet_config_path and not Config.bytes_output_mode:
-        log_error("Missing --alphabet_config_path flag, can't continue")
+        raise RuntimeError("Missing --alphabet_config_path flag, can't continue")
        sys.exit(1)
 def main():
    initialize_globals_from_cli()
    def deprecated_msg(prefix):
        return (
            f"{prefix} Using the training script as a generic driver for all training "
            "related functionality is deprecated and will be removed soon. Use "
            "the specific scripts: train.py/evaluate.py/export.py/training_graph_inference.py."
        )
    if Config.train_files:
        train()
    else:
        log_warn(deprecated_msg("Calling training script without --train_files."))
    if Config.test_files:
-        test()
+        log_warn(
-
+            deprecated_msg(
-    if Config.export_dir and not Config.export_zip:
+                "Specifying --test_files when calling train.py script. Use evaluate.py."
        export()
    if Config.export_zip:
        Config.export_tflite = True
        if listdir_remote(Config.export_dir):
            log_error(
                "Directory {} is not empty, please fix this.".format(Config.export_dir)
            )
-            sys.exit(1)
+        )
        evaluate.test()
-        export()
+    if Config.export_dir:
-        package_zip()
+        log_warn(
            deprecated_msg(
                "Specifying --export_dir when calling train.py script. Use export.py."
            )
        )
        export.export()
    if Config.one_shot_infer:
-        do_single_file_inference(Config.one_shot_infer)
+        log_warn(
            deprecated_msg(
                "Specifying --one_shot_infer when calling train.py script. Use training_graph_inference.py."
            )
        )
        traning_graph_inference.do_single_file_inference(Config.one_shot_infer)
 if __name__ == "__main__":
--- a/training/coqui_stt_training/training_graph_inference.py
+++ b/training/coqui_stt_training/training_graph_inference.py
@ -0,0 +1,87 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 import os
 import sys
 LOG_LEVEL_INDEX = sys.argv.index("--log_level") + 1 if "--log_level" in sys.argv else 0
 DESIRED_LOG_LEVEL = (
    sys.argv[LOG_LEVEL_INDEX] if 0 < LOG_LEVEL_INDEX < len(sys.argv) else "3"
 )
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = DESIRED_LOG_LEVEL
 import numpy as np
 import tensorflow as tf
 import tensorflow.compat.v1 as tfv1
 from coqui_stt_ctcdecoder import ctc_beam_search_decoder, Scorer
 from .deepspeech_model import create_inference_graph, create_overlapping_windows
 from .util.checkpoints import load_graph_for_evaluation
 from .util.config import Config, initialize_globals_from_cli, log_error
 from .util.feeding import audiofile_to_features
 def do_single_file_inference(input_file_path):
    """
    Transcribe a single audio file with the training checkpoint and print the result.

    A batch-size-1 inference graph with a dynamic number of steps is built in
    a fresh default graph, the checkpoint variables are restored, the file's
    features are run through the model starting from an all-zero LSTM state,
    and the first entry returned by the CTC beam search decoder is printed.
    """
    tfv1.reset_default_graph()
    with tfv1.Session(config=Config.session_config) as session:
        graph_inputs, graph_outputs, _ = create_inference_graph(
            batch_size=1, n_steps=-1
        )
        # Restore variables from training checkpoint
        load_graph_for_evaluation(session)
        feature_op, length_op = audiofile_to_features(input_file_path)
        # Inference starts from an all-zero LSTM state
        zero_state_c = np.zeros([1, Config.n_cell_dim])
        zero_state_h = np.zeros([1, Config.n_cell_dim])
        # Add batch dimension
        batched_features = tf.expand_dims(feature_op, 0)
        batched_lengths = tf.expand_dims(length_op, 0)
        # Materialise the graph tensors as concrete arrays
        windowed = create_overlapping_windows(batched_features).eval(session=session)
        lengths = batched_lengths.eval(session=session)
        feed = {
            graph_inputs["input"]: windowed,
            graph_inputs["input_lengths"]: lengths,
            graph_inputs["previous_state_c"]: zero_state_c,
            graph_inputs["previous_state_h"]: zero_state_h,
        }
        probs = np.squeeze(
            graph_outputs["outputs"].eval(feed_dict=feed, session=session)
        )
        # The external scorer is optional
        scorer = (
            Scorer(Config.lm_alpha, Config.lm_beta, Config.scorer_path, Config.alphabet)
            if Config.scorer_path
            else None
        )
        decoded = ctc_beam_search_decoder(
            probs,
            Config.alphabet,
            Config.beam_width,
            scorer=scorer,
            cutoff_prob=Config.cutoff_prob,
            cutoff_top_n=Config.cutoff_top_n,
        )
        # Print highest probability result
        print(decoded[0][1])
 def main():
    """
    Script entry point: run one-shot inference on `Config.one_shot_infer`.

    Raises:
        RuntimeError: if no `--one_shot_infer` input audio file was given.
    """
    initialize_globals_from_cli()
    audio_path = Config.one_shot_infer
    if not audio_path:
        raise RuntimeError(
            "Calling training_graph_inference script directly but no --one_shot_infer input audio file specified"
        )
    tfv1.reset_default_graph()
    do_single_file_inference(Config.one_shot_infer)
 if __name__ == "__main__":
    main()
--- a/training/coqui_stt_training/util/config.py
+++ b/training/coqui_stt_training/util/config.py
@ -477,7 +477,7 @@ class _SttConfig(Coqpit):
        default=False, metadata=dict(help="whether to remove old exported models")
    )
    export_tflite: bool = field(
-        default=False, metadata=dict(help="export a graph ready for TF Lite engine")
+        default=True, metadata=dict(help="export a graph ready for TF Lite engine")
    )
    export_quantize: bool = field(
        default=True,
--- a/training/coqui_stt_training/util/io.py
+++ b/training/coqui_stt_training/util/io.py
@ -90,3 +90,10 @@ def remove_remote(filename):
    """
    # Conditional import
    return gfile.remove(filename)
 def rmtree_remote(foldername):
    """
    Wrapper that can remove local and remote directories like `gs://...`

    Delegates to `gfile.rmtree` and returns whatever that call returns.
    NOTE(review): presumably this deletes the directory recursively, unlike
    `remove_remote` which calls `gfile.remove` — confirm against the gfile API.
    """
    return gfile.rmtree(foldername)