
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function
import os
import sys
LOG_LEVEL_INDEX = sys.argv.index("--log_level") + 1 if "--log_level" in sys.argv else 0
DESIRED_LOG_LEVEL = (
    sys.argv[LOG_LEVEL_INDEX] if 0 < LOG_LEVEL_INDEX < len(sys.argv) else "3"
)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = DESIRED_LOG_LEVEL
import json
import shutil
import time
import numpy as np
import progressbar
import tensorflow.compat.v1 as tfv1
import tensorflow as tf
tfv1.logging.set_verbosity(
    {
        "0": tfv1.logging.DEBUG,
        "1": tfv1.logging.INFO,
        "2": tfv1.logging.WARN,
        "3": tfv1.logging.ERROR,
    }.get(DESIRED_LOG_LEVEL, tfv1.logging.ERROR)  # fall back to ERROR for unknown levels
)
from datetime import datetime
from coqui_stt_ctcdecoder import Scorer, ctc_beam_search_decoder
from six.moves import range, zip
from .evaluate import evaluate
from .util.augmentations import NormalizeSampleRate
from .util.checkpoints import (
    load_graph_for_evaluation,
    load_or_init_graph_for_training,
    reload_best_checkpoint,
)
from .util.config import (
    Config,
    create_progressbar,
    initialize_globals_from_cli,
    log_debug,
    log_error,
    log_info,
    log_progress,
    log_warn,
)
from .util.evaluate_tools import save_samples_json
from .util.feeding import audio_to_features, audiofile_to_features, create_dataset
from .util.helpers import ExceptionBox, check_ctcdecoder_version
from .util.io import (
    is_remote_path,
    isdir_remote,
    listdir_remote,
    open_remote,
    remove_remote,
)
check_ctcdecoder_version()
# Graph Creation
# ==============
def variable_on_cpu(name, shape, initializer):
    r"""
    Next we concern ourselves with graph creation.
    However, before we do so we must introduce a utility function ``variable_on_cpu()``
    used to create a variable in CPU memory.
    """
    # Use the /cpu:0 device for scoped operations
    with tf.device(Config.cpu_device):
        # Create or get the appropriate variable
        var = tfv1.get_variable(name=name, shape=shape, initializer=initializer)
    return var
def create_overlapping_windows(batch_x):
    batch_size = tf.shape(input=batch_x)[0]
    window_width = 2 * Config.n_context + 1
    num_channels = Config.n_input

    # Create a constant convolution filter using an identity matrix, so that the
    # convolution returns patches of the input tensor as is, and we can create
    # overlapping windows over the MFCCs.
    eye_filter = tf.constant(
        np.eye(window_width * num_channels).reshape(
            window_width, num_channels, window_width * num_channels
        ),
        tf.float32,
    )  # pylint: disable=bad-continuation

    # Create overlapping windows
    batch_x = tf.nn.conv1d(input=batch_x, filters=eye_filter, stride=1, padding="SAME")

    # Remove dummy depth dimension and reshape into [batch_size, n_windows, window_width, n_input]
    batch_x = tf.reshape(batch_x, [batch_size, -1, window_width, num_channels])

    return batch_x
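

# A minimal sanity check of the identity-filter trick above (hypothetical
# helper, never called by the training flow; the toy sizes are assumptions,
# not Config values): with one frame of context per side, every output
# window should contain three consecutive input frames, zero-padded at the
# sequence edges by the "SAME" convolution.
def _example_overlapping_windows_check():
    n_context, n_input = 1, 2
    window_width = 2 * n_context + 1
    eye_filter = tf.constant(
        np.eye(window_width * n_input).reshape(
            window_width, n_input, window_width * n_input
        ),
        tf.float32,
    )
    # One batch of four frames with two features each: [[0, 1], [2, 3], ...]
    batch_x = tf.reshape(tf.range(8, dtype=tf.float32), [1, 4, n_input])
    windows = tf.nn.conv1d(input=batch_x, filters=eye_filter, stride=1, padding="SAME")
    # Shape [1, 4, 3, 2]; window 1 holds frames 0..2, window 2 holds frames 1..3
    return tf.reshape(windows, [1, -1, window_width, n_input])
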
def dense(name, x, units, dropout_rate=None, relu=True, layer_norm=False):
    with tfv1.variable_scope(name):
        bias = variable_on_cpu("bias", [units], tfv1.zeros_initializer())
        weights = variable_on_cpu(
            "weights",
            [x.shape[-1], units],
            tfv1.keras.initializers.VarianceScaling(
                scale=1.0, mode="fan_avg", distribution="uniform"
            ),
        )

    output = tf.nn.bias_add(tf.matmul(x, weights), bias)

    if relu:
        output = tf.minimum(tf.nn.relu(output), Config.relu_clip)

    if layer_norm:
        with tfv1.variable_scope(name):
            output = tf.contrib.layers.layer_norm(output)

    if dropout_rate is not None:
        output = tf.nn.dropout(output, rate=dropout_rate)

    return output
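

# A toy application of `dense` above (hypothetical helper, never called; the
# sizes and scope name are made up for illustration): maps a [batch, 5] input
# to 3 units with the same clipped-ReLU behaviour used by the hidden layers
# in `create_model` below.
def _example_dense_layer():
    x = tf.zeros([4, 5])
    with tfv1.variable_scope("example_dense", reuse=tfv1.AUTO_REUSE):
        return dense("layer_example", x, 3)  # shape [4, 3], values in [0, relu_clip]
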
def rnn_impl_lstmblockfusedcell(x, seq_length, previous_state, reuse):
    with tfv1.variable_scope("cudnn_lstm/rnn/multi_rnn_cell/cell_0"):
        fw_cell = tf.contrib.rnn.LSTMBlockFusedCell(
            Config.n_cell_dim,
            forget_bias=0,
            reuse=reuse,
            name="cudnn_compatible_lstm_cell",
        )

        output, output_state = fw_cell(
            inputs=x,
            dtype=tf.float32,
            sequence_length=seq_length,
            initial_state=previous_state,
        )

    return output, output_state
def rnn_impl_cudnn_rnn(x, seq_length, previous_state, _):
    assert (
        previous_state is None
    )  # 'Passing previous state not supported with CuDNN backend'

    # Hack: CudnnLSTM works similarly to Keras layers in that when you instantiate
    # the object it creates the variables, and then you just call it several times
    # to enable variable re-use. Because all of our code is structured in an
    # old-school TensorFlow fashion where you can just call tf.get_variable again
    # with reuse=True to reuse variables, we can't easily make use of the object
    # oriented way CudnnLSTM is implemented, so we save a singleton instance in
    # the function, emulating a static function variable.
    if not rnn_impl_cudnn_rnn.cell:
        # Forward direction cell:
        fw_cell = tf.contrib.cudnn_rnn.CudnnLSTM(
            num_layers=1,
            num_units=Config.n_cell_dim,
            input_mode="linear_input",
            direction="unidirectional",
            dtype=tf.float32,
        )
        rnn_impl_cudnn_rnn.cell = fw_cell

    output, output_state = rnn_impl_cudnn_rnn.cell(
        inputs=x, sequence_lengths=seq_length
    )

    return output, output_state


rnn_impl_cudnn_rnn.cell = None
def rnn_impl_static_rnn(x, seq_length, previous_state, reuse):
    with tfv1.variable_scope("cudnn_lstm/rnn/multi_rnn_cell"):
        # Forward direction cell:
        fw_cell = tfv1.nn.rnn_cell.LSTMCell(
            Config.n_cell_dim,
            forget_bias=0,
            reuse=reuse,
            name="cudnn_compatible_lstm_cell",
        )

        # Split rank N tensor into list of rank N-1 tensors
        x = [x[l] for l in range(x.shape[0])]

        output, output_state = tfv1.nn.static_rnn(
            cell=fw_cell,
            inputs=x,
            sequence_length=seq_length,
            initial_state=previous_state,
            dtype=tf.float32,
            scope="cell_0",
        )

        output = tf.concat(output, 0)

    return output, output_state
def create_model(
    batch_x,
    seq_length,
    dropout,
    reuse=False,
    batch_size=None,
    previous_state=None,
    overlap=True,
    rnn_impl=rnn_impl_lstmblockfusedcell,
):
    layers = {}

    # Input shape: [batch_size, n_steps, n_input + 2*n_input*n_context]
    if not batch_size:
        batch_size = tf.shape(input=batch_x)[0]

    # Create overlapping feature windows if needed
    if overlap:
        batch_x = create_overlapping_windows(batch_x)

    # Reshaping `batch_x` to a tensor with shape `[n_steps*batch_size, n_input + 2*n_input*n_context]`.
    # This is done to prepare the batch for input into the first layer which expects a tensor of rank `2`.

    # Permute n_steps and batch_size
    batch_x = tf.transpose(a=batch_x, perm=[1, 0, 2, 3])
    # Reshape to prepare input for first layer
    batch_x = tf.reshape(
        batch_x, [-1, Config.n_input + 2 * Config.n_input * Config.n_context]
    )  # (n_steps*batch_size, n_input + 2*n_input*n_context)
    layers["input_reshaped"] = batch_x

    # The next three blocks will pass `batch_x` through three hidden layers with
    # clipped RELU activation and dropout.
    layers["layer_1"] = layer_1 = dense(
        "layer_1",
        batch_x,
        Config.n_hidden_1,
        dropout_rate=dropout[0],
        layer_norm=Config.layer_norm,
    )
    layers["layer_2"] = layer_2 = dense(
        "layer_2",
        layer_1,
        Config.n_hidden_2,
        dropout_rate=dropout[1],
        layer_norm=Config.layer_norm,
    )
    layers["layer_3"] = layer_3 = dense(
        "layer_3",
        layer_2,
        Config.n_hidden_3,
        dropout_rate=dropout[2],
        layer_norm=Config.layer_norm,
    )
    # `layer_3` is now reshaped into `[n_steps, batch_size, n_hidden_3]`,
    # as the LSTM RNN expects its input to be of shape `[max_time, batch_size, input_size]`.
    layer_3 = tf.reshape(layer_3, [-1, batch_size, Config.n_hidden_3])
    # Run through parametrized RNN implementation, as we use different RNNs
    # for training and inference
    output, output_state = rnn_impl(layer_3, seq_length, previous_state, reuse)

    # Reshape output from a tensor of shape [n_steps, batch_size, n_cell_dim]
    # to a tensor of shape [n_steps*batch_size, n_cell_dim]
    output = tf.reshape(output, [-1, Config.n_cell_dim])
    layers["rnn_output"] = output
    layers["rnn_output_state"] = output_state

    # Now we feed `output` to the fifth hidden layer with clipped RELU activation
    layers["layer_5"] = layer_5 = dense(
        "layer_5",
        output,
        Config.n_hidden_5,
        dropout_rate=dropout[5],
        layer_norm=Config.layer_norm,
    )

    # Now we apply a final linear layer creating `n_classes` dimensional vectors, the logits.
    layers["layer_6"] = layer_6 = dense(
        "layer_6", layer_5, Config.n_hidden_6, relu=False
    )

    # Finally we reshape layer_6 from a tensor of shape [n_steps*batch_size, n_hidden_6]
    # to the slightly more useful shape [n_steps, batch_size, n_hidden_6].
    # Note, that this differs from the input in that it is time-major.
    layer_6 = tf.reshape(
        layer_6, [-1, batch_size, Config.n_hidden_6], name="raw_logits"
    )
    layers["raw_logits"] = layer_6

    # Output shape: [n_steps, batch_size, n_hidden_6]
    return layer_6, layers
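

# A hedged sketch of driving `create_model` directly (hypothetical helper for
# illustration only; real training feeds it from the dataset iterator in
# `calculate_mean_edit_distance_and_loss` below, and the placeholder names
# here are made up). Features come in batch-major, logits come out time-major.
def _example_create_model_call():
    batch_x = tfv1.placeholder(
        tf.float32, [None, None, Config.n_input], name="example_features"
    )
    seq_length = tfv1.placeholder(tf.int32, [None], name="example_seq_length")
    no_dropout = [None] * 6  # one (disabled) rate per layer
    logits, layers = create_model(batch_x, seq_length, no_dropout)
    # logits: [n_steps, batch_size, n_hidden_6]
    return logits, layers
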
# Accuracy and Loss
# =================
# In accord with 'Deep Speech: Scaling up end-to-end speech recognition'
# (http://arxiv.org/abs/1412.5567),
# the loss function used by our network should be the CTC loss function
# (http://www.cs.toronto.edu/~graves/preprint.pdf).
# Conveniently, this loss function is implemented in TensorFlow.
# Thus, we can simply make use of this implementation to define our loss.
def calculate_mean_edit_distance_and_loss(iterator, dropout, reuse):
    r"""
    This routine computes the CTC loss for a mini-batch and returns the
    average loss across the batch, together with any files in the batch
    that produced a non-finite loss.
    """
    # Obtain the next batch of data
    batch_filenames, (batch_x, batch_seq_len), batch_y = iterator.get_next()

    if Config.train_cudnn:
        rnn_impl = rnn_impl_cudnn_rnn
    else:
        rnn_impl = rnn_impl_lstmblockfusedcell

    # Calculate the logits of the batch
    logits, _ = create_model(
        batch_x, batch_seq_len, dropout, reuse=reuse, rnn_impl=rnn_impl
    )

    # Compute the CTC loss using TensorFlow's `ctc_loss`
    total_loss = tfv1.nn.ctc_loss(
        labels=batch_y, inputs=logits, sequence_length=batch_seq_len
    )

    # Check if any files lead to non finite loss
    non_finite_files = tf.gather(
        batch_filenames, tfv1.where(~tf.math.is_finite(total_loss))
    )

    # Calculate the average loss across the batch
    avg_loss = tf.reduce_mean(input_tensor=total_loss)

    # Finally we return the average loss
    return avg_loss, non_finite_files
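

# A toy illustration of the CTC loss used above (hypothetical helper with
# assumed sizes, never called): time-major logits of shape
# [max_time, batch_size, num_classes] scored against a sparse label batch,
# then averaged exactly as in `calculate_mean_edit_distance_and_loss`.
def _example_ctc_loss():
    max_time, batch_size, num_classes = 50, 2, 29
    logits = tf.random.uniform([max_time, batch_size, num_classes])
    # Two short label sequences, e.g. [5, 3] and [7], as a SparseTensor
    labels = tf.SparseTensor(
        indices=[[0, 0], [0, 1], [1, 0]],
        values=[5, 3, 7],
        dense_shape=[batch_size, 2],
    )
    seq_len = tf.fill([batch_size], max_time)
    total_loss = tfv1.nn.ctc_loss(
        labels=labels, inputs=logits, sequence_length=seq_len
    )
    return tf.reduce_mean(input_tensor=total_loss)
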
# Adam Optimization
# =================
# In contrast to 'Deep Speech: Scaling up end-to-end speech recognition'
# (http://arxiv.org/abs/1412.5567),
# in which 'Nesterov's Accelerated Gradient Descent'
# (www.cs.toronto.edu/~fritz/absps/momentum.pdf) was used,
# we will use the Adam method for optimization (http://arxiv.org/abs/1412.6980),
# because, generally, it requires less fine-tuning.
def create_optimizer(learning_rate_var):
    optimizer = tfv1.train.AdamOptimizer(
        learning_rate=learning_rate_var,
        beta1=Config.beta1,
        beta2=Config.beta2,
        epsilon=Config.epsilon,
    )
    return optimizer
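

# A miniature of how this optimizer interacts with the mutable learning-rate
# variable created in `train()` below (hypothetical helper with assumed
# values): Adam re-reads the variable every step, so assigning it a smaller
# value implements the plateau-based reduction without rebuilding the
# optimizer.
def _example_reduce_learning_rate():
    lr = tfv1.get_variable("example_lr", initializer=0.001, trainable=False)
    reduce_op = lr.assign(tf.multiply(lr, 0.1))  # assumed plateau factor 0.1
    return create_optimizer(lr), reduce_op
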
# Towers
# ======
# In order to properly make use of multiple GPUs, one must introduce new abstractions,
# not present when using a single GPU, that facilitate the multi-GPU use case.
# In particular, one must introduce a means to isolate the inference and gradient
# calculations on the various GPUs.
# The abstraction we introduce for this purpose is called a 'tower'.
# A tower is specified by two properties:
# * **Scope** - A scope, as provided by `tf.name_scope()`,
# is a means to isolate the operations within a tower.
# For example, all operations within 'tower 0' could have their name prefixed with `tower_0/`.
# * **Device** - A hardware device, as provided by `tf.device()`,
# on which all operations within the tower execute.
# For example, all operations of 'tower 0' could execute on the first GPU `tf.device('/gpu:0')`.
def get_tower_results(iterator, optimizer, dropout_rates):
    r"""
    With this preliminary step out of the way, we can, for each GPU, introduce
    a tower, for whose batch we calculate and return the optimization gradients
    and the average loss across towers.
    """
    # To calculate the mean of the losses
    tower_avg_losses = []

    # Tower gradients to return
    tower_gradients = []

    # Aggregate any non finite files in the batches
    tower_non_finite_files = []

    with tfv1.variable_scope(tfv1.get_variable_scope()):
        # Loop over available_devices
        for i in range(len(Config.available_devices)):
            # Execute operations of tower i on device i
            device = Config.available_devices[i]
            with tf.device(device):
                # Create a scope for all operations of tower i
                with tf.name_scope("tower_%d" % i):
                    # Calculate this tower's average loss and collect any files
                    # in its batch that produced a non-finite loss
                    avg_loss, non_finite_files = calculate_mean_edit_distance_and_loss(
                        iterator, dropout_rates, reuse=i > 0
                    )

                    # Allow for variables to be re-used by the next tower
                    tfv1.get_variable_scope().reuse_variables()

                    # Retain tower's avg losses
                    tower_avg_losses.append(avg_loss)

                    # Compute gradients for model parameters using tower's mini-batch
                    gradients = optimizer.compute_gradients(avg_loss)

                    # Retain tower's gradients
                    tower_gradients.append(gradients)

                    tower_non_finite_files.append(non_finite_files)

    avg_loss_across_towers = tf.reduce_mean(input_tensor=tower_avg_losses, axis=0)
    tfv1.summary.scalar(
        name="step_loss", tensor=avg_loss_across_towers, collections=["step_summaries"]
    )

    all_non_finite_files = tf.concat(tower_non_finite_files, axis=0)

    # Return gradients and the average loss
    return tower_gradients, avg_loss_across_towers, all_non_finite_files
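

# The scoping pattern described above, in miniature (hypothetical helper with
# made-up devices; real towers come from Config.available_devices): each
# tower's ops live in their own name scope on their own device, while the
# surrounding variable scope lets towers share weights via `reuse_variables()`.
def _example_tower_scoping():
    tower_outputs = []
    for i, device in enumerate(["/cpu:0", "/cpu:0"]):
        with tf.device(device):
            with tf.name_scope("example_tower_%d" % i):
                # Ops created here are named example_tower_<i>/...
                tower_outputs.append(tf.constant(float(i), name="tower_output"))
    return tower_outputs
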
def average_gradients(tower_gradients):
    r"""
    A routine for computing, for each variable, the average of the gradients
    obtained from the GPUs.
    Note also that this code acts as a synchronization point as it requires all
    GPUs to be finished with their mini-batch before it can run to completion.
    """
    # List of average gradients to return to the caller
    average_grads = []

    # Run this on cpu_device to conserve GPU memory
    with tf.device(Config.cpu_device):
        # Loop over gradient/variable pairs from all towers
        for grad_and_vars in zip(*tower_gradients):
            # Introduce grads to store the gradients for the current variable
            grads = []

            # Loop over the gradients for the current variable
            for g, _ in grad_and_vars:
                # Add 0 dimension to the gradients to represent the tower.
                expanded_g = tf.expand_dims(g, 0)
                # Append on a 'tower' dimension which we will average over below.
                grads.append(expanded_g)

            # Average over the 'tower' dimension
            grad = tf.concat(grads, 0)
            grad = tf.reduce_mean(input_tensor=grad, axis=0)

            # Create a gradient/variable tuple for the current variable with its average gradient
            grad_and_var = (grad, grad_and_vars[0][1])

            # Add the current tuple to average_grads
            average_grads.append(grad_and_var)

    # Return result to caller
    return average_grads
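

# A concrete miniature of the averaging above (hypothetical helper, never
# called: two towers, one two-element variable): stacking the per-tower
# gradients on a new leading 'tower' axis and taking the mean over it yields
# the element-wise average.
def _example_average_gradients():
    v = tfv1.get_variable("example_avg_var", initializer=tf.zeros([2]))
    tower_gradients = [
        [(tf.constant([1.0, 3.0]), v)],  # gradients from tower 0
        [(tf.constant([3.0, 5.0]), v)],  # gradients from tower 1
    ]
    # Evaluates to [(array([2.0, 4.0]), v)]
    return average_gradients(tower_gradients)
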
# Logging
# =======
def log_variable(variable, gradient=None):
    r"""
    We introduce a function for logging a tensor variable's current state.
    It logs scalar values for the mean, standard deviation, minimum and maximum.
    Furthermore it logs a histogram of its state and (if given) of an optimization gradient.
    """
    name = variable.name.replace(":", "_")
    mean = tf.reduce_mean(input_tensor=variable)
    tfv1.summary.scalar(name="%s/mean" % name, tensor=mean)
    tfv1.summary.scalar(
        name="%s/stddev" % name,
        tensor=tf.sqrt(tf.reduce_mean(input_tensor=tf.square(variable - mean))),
    )
    tfv1.summary.scalar(
        name="%s/max" % name, tensor=tf.reduce_max(input_tensor=variable)
    )
    tfv1.summary.scalar(
        name="%s/min" % name, tensor=tf.reduce_min(input_tensor=variable)
    )
    tfv1.summary.histogram(name=name, values=variable)
    if gradient is not None:
        if isinstance(gradient, tf.IndexedSlices):
            grad_values = gradient.values
        else:
            grad_values = gradient
        if grad_values is not None:
            tfv1.summary.histogram(name="%s/gradients" % name, values=grad_values)
def log_grads_and_vars(grads_and_vars):
    r"""
    Let's also introduce a helper function for logging collections of gradient/variable tuples.
    """
    for gradient, variable in grads_and_vars:
        log_variable(variable, gradient=gradient)
def train():
    tfv1.reset_default_graph()
    tfv1.set_random_seed(Config.random_seed)

    exception_box = ExceptionBox()

    # Create training and validation datasets
    train_set = create_dataset(
        Config.train_files,
        batch_size=Config.train_batch_size,
        epochs=Config.epochs,
        augmentations=Config.augmentations,
        cache_path=Config.feature_cache,
        train_phase=True,
        exception_box=exception_box,
        process_ahead=len(Config.available_devices) * Config.train_batch_size * 2,
        reverse=Config.reverse_train,
        limit=Config.limit_train,
        buffering=Config.read_buffer,
    )
    iterator = tfv1.data.Iterator.from_structure(
        tfv1.data.get_output_types(train_set),
        tfv1.data.get_output_shapes(train_set),
        output_classes=tfv1.data.get_output_classes(train_set),
    )

    # Make initialization ops for switching between the two sets
    train_init_op = iterator.make_initializer(train_set)
    if Config.dev_files:
        dev_sources = Config.dev_files
        dev_sets = [
            create_dataset(
                [source],
                batch_size=Config.dev_batch_size,
                train_phase=False,
                augmentations=[NormalizeSampleRate(Config.audio_sample_rate)],
                exception_box=exception_box,
                process_ahead=len(Config.available_devices) * Config.dev_batch_size * 2,
                reverse=Config.reverse_dev,
                limit=Config.limit_dev,
                buffering=Config.read_buffer,
            )
            for source in dev_sources
        ]
        dev_init_ops = [iterator.make_initializer(dev_set) for dev_set in dev_sets]
    if Config.metrics_files:
        metrics_sources = Config.metrics_files
        metrics_sets = [
            create_dataset(
                [source],
                batch_size=Config.dev_batch_size,
                train_phase=False,
                augmentations=[NormalizeSampleRate(Config.audio_sample_rate)],
                exception_box=exception_box,
                process_ahead=len(Config.available_devices) * Config.dev_batch_size * 2,
                reverse=Config.reverse_dev,
                limit=Config.limit_dev,
                buffering=Config.read_buffer,
            )
            for source in metrics_sources
        ]
        metrics_init_ops = [
            iterator.make_initializer(metrics_set) for metrics_set in metrics_sets
        ]
    # Dropout
    dropout_rates = [
        tfv1.placeholder(tf.float32, name="dropout_{}".format(i)) for i in range(6)
    ]
    dropout_feed_dict = {
        dropout_rates[0]: Config.dropout_rate,
        dropout_rates[1]: Config.dropout_rate2,
        dropout_rates[2]: Config.dropout_rate3,
        dropout_rates[3]: Config.dropout_rate4,
        dropout_rates[4]: Config.dropout_rate5,
        dropout_rates[5]: Config.dropout_rate6,
    }
    no_dropout_feed_dict = {rate: 0.0 for rate in dropout_rates}
    # Building the graph
    learning_rate_var = tfv1.get_variable(
        "learning_rate", initializer=Config.learning_rate, trainable=False
    )
    reduce_learning_rate_op = learning_rate_var.assign(
        tf.multiply(learning_rate_var, Config.plateau_reduction)
    )
    optimizer = create_optimizer(learning_rate_var)

    # Enable mixed precision training
    if Config.automatic_mixed_precision:
        log_info("Enabling automatic mixed precision training.")
        optimizer = tfv1.train.experimental.enable_mixed_precision_graph_rewrite(
            optimizer
        )

    gradients, loss, non_finite_files = get_tower_results(
        iterator, optimizer, dropout_rates
    )

    # Average tower gradients across GPUs
    avg_tower_gradients = average_gradients(gradients)
    log_grads_and_vars(avg_tower_gradients)

    # global_step is automagically incremented by the optimizer
    global_step = tfv1.train.get_or_create_global_step()
    apply_gradient_op = optimizer.apply_gradients(
        avg_tower_gradients, global_step=global_step
    )
    # Summaries
    step_summaries_op = tfv1.summary.merge_all("step_summaries")
    step_summary_writers = {
        "train": tfv1.summary.FileWriter(
            os.path.join(Config.summary_dir, "train"), max_queue=120
        ),
        "dev": tfv1.summary.FileWriter(
            os.path.join(Config.summary_dir, "dev"), max_queue=120
        ),
        "metrics": tfv1.summary.FileWriter(
            os.path.join(Config.summary_dir, "metrics"), max_queue=120
        ),
    }

    human_readable_set_names = {
        "train": "Training",
        "dev": "Validation",
        "metrics": "Metrics",
    }
    # Checkpointing
    checkpoint_saver = tfv1.train.Saver(max_to_keep=Config.max_to_keep)
    checkpoint_path = os.path.join(Config.save_checkpoint_dir, "train")

    best_dev_saver = tfv1.train.Saver(max_to_keep=1)
    best_dev_path = os.path.join(Config.save_checkpoint_dir, "best_dev")

    # Save flags next to checkpoints
    if not is_remote_path(Config.save_checkpoint_dir):
        os.makedirs(Config.save_checkpoint_dir, exist_ok=True)
    flags_file = os.path.join(Config.save_checkpoint_dir, "flags.txt")
    with open_remote(flags_file, "w") as fout:
        json.dump(Config.serialize(), fout, indent=2)
    with tfv1.Session(config=Config.session_config) as session:
        log_debug("Session opened.")

        # Prevent further graph changes
        tfv1.get_default_graph().finalize()

        # Load checkpoint or initialize variables
        load_or_init_graph_for_training(session)
        def run_set(set_name, epoch, init_op, dataset=None):
            is_train = set_name == "train"
            train_op = apply_gradient_op if is_train else []
            feed_dict = dropout_feed_dict if is_train else no_dropout_feed_dict

            total_loss = 0.0
            step_count = 0

            step_summary_writer = step_summary_writers.get(set_name)
            checkpoint_time = time.time()

            if is_train and Config.cache_for_epochs > 0 and Config.feature_cache:
                feature_cache_index = Config.feature_cache + ".index"
                if epoch % Config.cache_for_epochs == 0 and os.path.isfile(
                    feature_cache_index
                ):
                    log_info("Invalidating feature cache")
                    remove_remote(
                        feature_cache_index
                    )  # this will let TF also overwrite the related cache data files

            # Setup progress bar
            class LossWidget(progressbar.widgets.FormatLabel):
                def __init__(self):
                    progressbar.widgets.FormatLabel.__init__(
                        self, format="Loss: %(mean_loss)f"
                    )

                def __call__(self, progress, data, **kwargs):
                    data["mean_loss"] = total_loss / step_count if step_count else 0.0
                    return progressbar.widgets.FormatLabel.__call__(
                        self, progress, data, **kwargs
                    )

            prefix = "Epoch {} | {:>10}".format(
                epoch, human_readable_set_names[set_name]
            )
            widgets = [
                " | ",
                progressbar.widgets.Timer(),
                " | Steps: ",
                progressbar.widgets.Counter(),
                " | ",
                LossWidget(),
            ]
            suffix = " | Dataset: {}".format(dataset) if dataset else None
            pbar = create_progressbar(
                prefix=prefix, widgets=widgets, suffix=suffix
            ).start()

            # Initialize iterator to the appropriate dataset
            session.run(init_op)

            # Batch loop
            while True:
                try:
                    (
                        _,
                        current_step,
                        batch_loss,
                        problem_files,
                        step_summary,
                    ) = session.run(
                        [
                            train_op,
                            global_step,
                            loss,
                            non_finite_files,
                            step_summaries_op,
                        ],
                        feed_dict=feed_dict,
                    )
                    exception_box.raise_if_set()
                except tf.errors.OutOfRangeError:
                    exception_box.raise_if_set()
                    break

                if problem_files.size > 0:
                    problem_files = [f.decode("utf8") for f in problem_files[..., 0]]
                    log_error(
                        "The following files caused an infinite (or NaN) "
                        "loss: {}".format(",".join(problem_files))
                    )

                total_loss += batch_loss
                step_count += 1

                pbar.update(step_count)

                step_summary_writer.add_summary(step_summary, current_step)

                if (
                    is_train
                    and Config.checkpoint_secs > 0
                    and time.time() - checkpoint_time > Config.checkpoint_secs
                ):
                    checkpoint_saver.save(
                        session, checkpoint_path, global_step=current_step
                    )
                    checkpoint_time = time.time()

            pbar.finish()
            mean_loss = total_loss / step_count if step_count > 0 else 0.0
            return mean_loss, step_count
log_info("STARTING Optimization")
train_start_time = datetime.utcnow()
best_dev_loss = float("inf")
dev_losses = []
epochs_without_improvement = 0
try:
for epoch in range(Config.epochs):
# Training
log_progress("Training epoch %d..." % epoch)
train_loss, _ = run_set("train", epoch, train_init_op)
log_progress(
"Finished training epoch %d - loss: %f" % (epoch, train_loss)
)
checkpoint_saver.save(session, checkpoint_path, global_step=global_step)
if Config.dev_files:
# Validation
dev_loss = 0.0
total_steps = 0
for source, init_op in zip(dev_sources, dev_init_ops):
log_progress("Validating epoch %d on %s..." % (epoch, source))
set_loss, steps = run_set("dev", epoch, init_op, dataset=source)
dev_loss += set_loss * steps
total_steps += steps
log_progress(
"Finished validating epoch %d on %s - loss: %f"
% (epoch, source, set_loss)
)
dev_loss = dev_loss / total_steps
dev_losses.append(dev_loss)
# Count epochs without an improvement for early stopping and reduction of learning rate on a plateau
# the improvement has to be greater than Config.es_min_delta
if dev_loss > best_dev_loss - Config.es_min_delta:
epochs_without_improvement += 1
else:
epochs_without_improvement = 0
# Save new best model
if dev_loss < best_dev_loss:
best_dev_loss = dev_loss
save_path = best_dev_saver.save(
session,
best_dev_path,
global_step=global_step,
latest_filename="best_dev_checkpoint",
)
log_info(
"Saved new best validating model with loss %f to: %s"
% (best_dev_loss, save_path)
)
# Early stopping
if (
Config.early_stop
and epochs_without_improvement == Config.es_epochs
):
log_info(
"Early stop triggered as the loss did not improve the last {} epochs".format(
epochs_without_improvement
)
)
break
# Reduce learning rate on plateau
# If the learning rate was reduced and there is still no improvement
# wait Config.plateau_epochs before the learning rate is reduced again
if (
Config.reduce_lr_on_plateau
and epochs_without_improvement > 0
and epochs_without_improvement % Config.plateau_epochs == 0
):
# Reload checkpoint that we use the best_dev weights again
reload_best_checkpoint(session)
# Reduce learning rate
session.run(reduce_learning_rate_op)
current_learning_rate = learning_rate_var.eval()
log_info(
"Encountered a plateau, reducing learning rate to {}".format(
current_learning_rate
)
)
# Overwrite best checkpoint with new learning rate value
save_path = best_dev_saver.save(
session,
best_dev_path,
global_step=global_step,
latest_filename="best_dev_checkpoint",
)
log_info(
"Saved best validating model with reduced learning rate to: %s"
% (save_path)
)
if Config.metrics_files:
# Read only metrics, not affecting best validation loss tracking
for source, init_op in zip(metrics_sources, metrics_init_ops):
log_progress("Metrics for epoch %d on %s..." % (epoch, source))
set_loss, _ = run_set("metrics", epoch, init_op, dataset=source)
log_progress(
"Metrics for epoch %d on %s - loss: %f"
% (epoch, source, set_loss)
)
print("-" * 80)
except KeyboardInterrupt:
pass
log_info(
"FINISHED optimization in {}".format(datetime.utcnow() - train_start_time)
)
log_debug("Session closed.")
def test():
    tfv1.reset_default_graph()

    samples = evaluate(Config.test_files, create_model)
    if Config.test_output_file:
        save_samples_json(samples, Config.test_output_file)
def create_inference_graph(batch_size=1, n_steps=16, tflite=False):
    batch_size = batch_size if batch_size > 0 else None

    # Create feature computation graph
    input_samples = tfv1.placeholder(
        tf.float32, [Config.audio_window_samples], "input_samples"
    )
    samples = tf.expand_dims(input_samples, -1)
    mfccs, _ = audio_to_features(samples, Config.audio_sample_rate)
    mfccs = tf.identity(mfccs, name="mfccs")

    # Input tensor will be of shape [batch_size, n_steps, 2*n_context+1, n_input]
    # This shape is read by the native_client in STT_CreateModel to know the
    # value of n_steps, n_context and n_input. Make sure you update the code
    # there if this shape is changed.
    input_tensor = tfv1.placeholder(
        tf.float32,
        [
            batch_size,
            n_steps if n_steps > 0 else None,
            2 * Config.n_context + 1,
            Config.n_input,
        ],
        name="input_node",
    )
    seq_length = tfv1.placeholder(tf.int32, [batch_size], name="input_lengths")

    if batch_size is None:
        # batch_size was non-positive and has been mapped to None above:
        # no state management since n_steps is expected to be dynamic too (see below)
        previous_state = None
    else:
        previous_state_c = tfv1.placeholder(
            tf.float32, [batch_size, Config.n_cell_dim], name="previous_state_c"
        )
        previous_state_h = tfv1.placeholder(
            tf.float32, [batch_size, Config.n_cell_dim], name="previous_state_h"
        )
        previous_state = tf.nn.rnn_cell.LSTMStateTuple(
            previous_state_c, previous_state_h
        )

    # One rate per layer
    no_dropout = [None] * 6

    if tflite:
        rnn_impl = rnn_impl_static_rnn
    else:
        rnn_impl = rnn_impl_lstmblockfusedcell

    logits, layers = create_model(
        batch_x=input_tensor,
        batch_size=batch_size,
        seq_length=seq_length if not Config.export_tflite else None,
        dropout=no_dropout,
        previous_state=previous_state,
        overlap=False,
        rnn_impl=rnn_impl,
    )

    # TF Lite runtime will check that input dimensions are 1, 2 or 4
    # by default we get 3, the middle one being batch_size which is forced to
    # one in the inference graph, so remove that dimension
    if tflite:
        logits = tf.squeeze(logits, [1])

    # Apply softmax for CTC decoder
    probs = tf.nn.softmax(logits, name="logits")

    if batch_size is None:
        if tflite:
            raise NotImplementedError(
                "dynamic batch_size does not support tflite or streaming"
            )
        if n_steps > 0:
            raise NotImplementedError(
                "dynamic batch_size expects n_steps to be dynamic too"
            )
        return (
            {
                "input": input_tensor,
                "input_lengths": seq_length,
            },
            {
                "outputs": probs,
            },
            layers,
        )

    new_state_c, new_state_h = layers["rnn_output_state"]
    new_state_c = tf.identity(new_state_c, name="new_state_c")
    new_state_h = tf.identity(new_state_h, name="new_state_h")

    inputs = {
        "input": input_tensor,
        "previous_state_c": previous_state_c,
        "previous_state_h": previous_state_h,
        "input_samples": input_samples,
    }

    if not Config.export_tflite:
        inputs["input_lengths"] = seq_length

    outputs = {
        "outputs": probs,
        "new_state_c": new_state_c,
        "new_state_h": new_state_h,
        "mfccs": mfccs,
        # Expose internal layers for downstream applications
        "layer_3": layers["layer_3"],
        "layer_5": layers["layer_5"],
    }

    return inputs, outputs, layers
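

# A hedged sketch of driving the streaming graph above (hypothetical helper;
# the real consumers are the native client and `do_single_file_inference`
# below). Each call pushes one feature window through the acoustic model and
# threads the LSTM state into the next call. The non-TFLite graph also
# expects `inputs["input_lengths"]`, omitted here for brevity.
def _example_streaming_step(session, inputs, outputs, features, state_c, state_h):
    probs, next_c, next_h = session.run(
        [outputs["outputs"], outputs["new_state_c"], outputs["new_state_h"]],
        feed_dict={
            inputs["input"]: features,
            inputs["previous_state_c"]: state_c,
            inputs["previous_state_h"]: state_h,
        },
    )
    return probs, next_c, next_h
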
def file_relative_read(fname):
    return open(os.path.join(os.path.dirname(__file__), fname)).read()
def export():
    r"""
    Restores the trained variables into a simpler graph that will be exported for serving.
    """
    log_info("Exporting the model...")

    tfv1.reset_default_graph()

    inputs, outputs, _ = create_inference_graph(
        batch_size=Config.export_batch_size,
        n_steps=Config.n_steps,
        tflite=Config.export_tflite,
    )

    graph_version = int(file_relative_read("GRAPH_VERSION").strip())
    assert graph_version > 0

    outputs["metadata_version"] = tf.constant([graph_version], name="metadata_version")
    outputs["metadata_sample_rate"] = tf.constant(
        [Config.audio_sample_rate], name="metadata_sample_rate"
    )
    outputs["metadata_feature_win_len"] = tf.constant(
        [Config.feature_win_len], name="metadata_feature_win_len"
    )
    outputs["metadata_feature_win_step"] = tf.constant(
        [Config.feature_win_step], name="metadata_feature_win_step"
    )
    outputs["metadata_beam_width"] = tf.constant(
        [Config.export_beam_width], name="metadata_beam_width"
    )
    outputs["metadata_alphabet"] = tf.constant(
        [Config.alphabet.Serialize()], name="metadata_alphabet"
    )

    if Config.export_language:
        outputs["metadata_language"] = tf.constant(
            [Config.export_language.encode("utf-8")], name="metadata_language"
        )

    # Prevent further graph changes
    tfv1.get_default_graph().finalize()

    output_names_tensors = [
        tensor.op.name for tensor in outputs.values() if isinstance(tensor, tf.Tensor)
    ]
    output_names_ops = [
        op.name for op in outputs.values() if isinstance(op, tf.Operation)
    ]
    output_names = output_names_tensors + output_names_ops
    with tfv1.Session() as session:
        # Restore variables from checkpoint
        load_graph_for_evaluation(session)

        output_filename = Config.export_file_name + ".pb"
        if Config.remove_export:
            if isdir_remote(Config.export_dir):
                log_info("Removing old export")
                remove_remote(Config.export_dir)

        output_graph_path = os.path.join(Config.export_dir, output_filename)

        if not is_remote_path(Config.export_dir) and not os.path.isdir(
            Config.export_dir
        ):
            os.makedirs(Config.export_dir)

        frozen_graph = tfv1.graph_util.convert_variables_to_constants(
            sess=session,
            input_graph_def=tfv1.get_default_graph().as_graph_def(),
            output_node_names=output_names,
        )
        frozen_graph = tfv1.graph_util.extract_sub_graph(
            graph_def=frozen_graph, dest_nodes=output_names
        )

        if not Config.export_tflite:
            with open_remote(output_graph_path, "wb") as fout:
                fout.write(frozen_graph.SerializeToString())
        else:
            output_tflite_path = os.path.join(
                Config.export_dir, output_filename.replace(".pb", ".tflite")
            )

            converter = tf.lite.TFLiteConverter(
                frozen_graph,
                input_tensors=inputs.values(),
                output_tensors=outputs.values(),
            )
            if Config.export_quantize:
                converter.optimizations = [tf.lite.Optimize.DEFAULT]
            # AudioSpectrogram and Mfcc ops are custom but have built-in kernels in TFLite
            converter.allow_custom_ops = True
            tflite_model = converter.convert()

            with open_remote(output_tflite_path, "wb") as fout:
                fout.write(tflite_model)

        log_info("Models exported at %s" % (Config.export_dir))
    metadata_fname = os.path.join(
        Config.export_dir,
        "{}_{}_{}.md".format(
            Config.export_author_id,
            Config.export_model_name,
            Config.export_model_version,
        ),
    )

    model_runtime = "tflite" if Config.export_tflite else "tensorflow"
    with open_remote(metadata_fname, "w") as f:
        f.write("---\n")
        f.write("author: {}\n".format(Config.export_author_id))
        f.write("model_name: {}\n".format(Config.export_model_name))
        f.write("model_version: {}\n".format(Config.export_model_version))
        f.write("contact_info: {}\n".format(Config.export_contact_info))
        f.write("license: {}\n".format(Config.export_license))
        f.write("language: {}\n".format(Config.export_language))
        f.write("runtime: {}\n".format(model_runtime))
        f.write("min_stt_version: {}\n".format(Config.export_min_stt_version))
        f.write("max_stt_version: {}\n".format(Config.export_max_stt_version))
        f.write(
            "acoustic_model_url: <replace this with a publicly available URL of the acoustic model>\n"
        )
        f.write(
            "scorer_url: <replace this with a publicly available URL of the scorer, if present>\n"
        )
        f.write("---\n")
        f.write("{}\n".format(Config.export_description))

    log_info(
        "Model metadata file saved to {}. Before submitting the exported model for publishing make sure all information in the metadata file is correct, and complete the URL fields.".format(
            metadata_fname
        )
    )
def package_zip():
    # --export_dir path/to/export/LANG_CODE/ => path/to/export/LANG_CODE.zip
    export_dir = os.path.join(
        os.path.abspath(Config.export_dir), ""
    )  # Force ending '/'
    if is_remote_path(export_dir):
        log_error(
            "Cannot package remote path zip %s. Please do this manually." % export_dir
        )
        return

    zip_filename = os.path.dirname(export_dir)

    shutil.copy(Config.scorer_path, export_dir)

    archive = shutil.make_archive(zip_filename, "zip", export_dir)
    log_info("Exported packaged model {}".format(archive))
def do_single_file_inference(input_file_path):
    tfv1.reset_default_graph()

    with tfv1.Session(config=Config.session_config) as session:
        inputs, outputs, _ = create_inference_graph(batch_size=1, n_steps=-1)

        # Restore variables from training checkpoint
        load_graph_for_evaluation(session)

        features, features_len = audiofile_to_features(input_file_path)
        previous_state_c = np.zeros([1, Config.n_cell_dim])
        previous_state_h = np.zeros([1, Config.n_cell_dim])

        # Add batch dimension
        features = tf.expand_dims(features, 0)
        features_len = tf.expand_dims(features_len, 0)

        # Evaluate
        features = create_overlapping_windows(features).eval(session=session)
        features_len = features_len.eval(session=session)

        probs = outputs["outputs"].eval(
            feed_dict={
                inputs["input"]: features,
                inputs["input_lengths"]: features_len,
                inputs["previous_state_c"]: previous_state_c,
                inputs["previous_state_h"]: previous_state_h,
            },
            session=session,
        )

        probs = np.squeeze(probs)

        if Config.scorer_path:
            scorer = Scorer(
                Config.lm_alpha, Config.lm_beta, Config.scorer_path, Config.alphabet
            )
        else:
            scorer = None
        decoded = ctc_beam_search_decoder(
            probs,
            Config.alphabet,
            Config.beam_width,
            scorer=scorer,
            cutoff_prob=Config.cutoff_prob,
            cutoff_top_n=Config.cutoff_top_n,
        )
        # Print highest probability result
        print(decoded[0][1])
def early_training_checks():
    # Check for proper scorer early
    if Config.scorer_path:
        scorer = Scorer(
            Config.lm_alpha, Config.lm_beta, Config.scorer_path, Config.alphabet
        )
        del scorer

    if (
        Config.train_files
        and Config.test_files
        and Config.load_checkpoint_dir != Config.save_checkpoint_dir
    ):
        log_warn(
            "WARNING: You specified different values for --load_checkpoint_dir "
            "and --save_checkpoint_dir, but you are running training and testing "
            "in a single invocation. The testing step will respect --load_checkpoint_dir, "
            "and thus WILL NOT TEST THE CHECKPOINT CREATED BY THE TRAINING STEP. "
            "Train and test in two separate invocations, specifying the correct "
            "--load_checkpoint_dir in both cases, or use the same location "
            "for loading and saving."
        )

    if not Config.alphabet_config_path and not Config.bytes_output_mode:
        log_error("Missing --alphabet_config_path flag, can't continue")
        sys.exit(1)
def main():
    initialize_globals_from_cli()
    early_training_checks()

    if Config.train_files:
        train()

    if Config.test_files:
        test()

    if Config.export_dir and not Config.export_zip:
        export()

    if Config.export_zip:
        Config.export_tflite = True

        if listdir_remote(Config.export_dir):
            log_error(
                "Directory {} is not empty, please fix this.".format(Config.export_dir)
            )
            sys.exit(1)

        export()
        package_zip()

    if Config.one_shot_infer:
        do_single_file_inference(Config.one_shot_infer)


if __name__ == "__main__":
    main()