Merge pull request #1946 from coqui-ai/training-submodules

Split train.py into separate modules
Reuben Morais 2021-08-25 19:37:53 +02:00 committed by GitHub
commit 5afe3c6e59
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 780 additions and 657 deletions
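For orientation, the split roughly gives each training-related task its own module under coqui_stt_training. A minimal sketch of the new entry points, pieced together from the diffs below; the keyword arguments passed to initialize_globals_from_args are illustrative placeholders, not values taken from this PR:

# Hypothetical driver script using the per-task modules introduced by this PR.
from coqui_stt_training.util.config import initialize_globals_from_args
from coqui_stt_training.train import train        # training loop only
from coqui_stt_training.evaluate import test      # test-set evaluation
from coqui_stt_training.export import export      # graph export (.pb / .tflite)

# Field names mirror Config options referenced in the diffs; values are examples.
initialize_globals_from_args(
    train_files=["data/train.csv"],
    dev_files=["data/dev.csv"],
    test_files=["data/test.csv"],
    checkpoint_dir="/tmp/ckpt",
    export_dir="/tmp/export",
)
train()
test()
export()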

View File

@@ -28,15 +28,15 @@ runs:
       case "${{ inputs.pyver }}" in
         3.7*)
           NUMPY_BUILD_VERSION="==1.14.5"
-          NUMPY_DEP_VERSION=">=1.14.5"
+          NUMPY_DEP_VERSION=">=1.14.5,<=1.19.4"
           ;;
         3.8*)
           NUMPY_BUILD_VERSION="==1.17.3"
-          NUMPY_DEP_VERSION=">=1.17.3"
+          NUMPY_DEP_VERSION=">=1.17.3,<=1.19.4"
           ;;
         3.9*)
           NUMPY_BUILD_VERSION="==1.19.4"
-          NUMPY_DEP_VERSION=">=1.19.4"
+          NUMPY_DEP_VERSION=">=1.19.4,<=1.19.4"
           ;;
       esac
       ;;
@@ -57,7 +57,7 @@ runs:
           ;;
         3.9*)
           NUMPY_BUILD_VERSION="==1.19.4"
-          NUMPY_DEP_VERSION=">=1.19.4"
+          NUMPY_DEP_VERSION=">=1.19.4,<=1.19.4"
           ;;
       esac
       ;;
@@ -82,7 +82,7 @@ runs:
           ;;
         3.9*)
           NUMPY_BUILD_VERSION="==1.19.4"
-          NUMPY_DEP_VERSION=">=1.19.4"
+          NUMPY_DEP_VERSION=">=1.19.4,<=1.19.4"
           ;;
       esac
       ;;

View File

@@ -27,4 +27,5 @@ python -u train.py --alphabet_config_path "data/alphabet.txt" \
   --max_to_keep 1 --checkpoint_dir '/tmp/ckpt' \
   --learning_rate 0.001 --dropout_rate 0.05 --export_dir '/tmp/train' \
   --scorer_path 'data/smoke_test/pruned_lm.scorer' \
-  --audio_sample_rate ${audio_sample_rate}
+  --audio_sample_rate ${audio_sample_rate} \
+  --export_tflite false

View File

@@ -27,4 +27,5 @@ python -u train.py --show_progressbar false --early_stop false \
  --learning_rate 0.001 --dropout_rate 0.05 --export_dir '/tmp/train_bytes' \
  --scorer_path 'data/smoke_test/pruned_lm.bytes.scorer' \
  --audio_sample_rate ${audio_sample_rate} \
-  --bytes_output_mode true
+  --bytes_output_mode true \
+  --export_tflite false

View File

@@ -2,7 +2,8 @@
 import os
 from import_ldc93s1 import _download_and_preprocess_data as download_ldc
 from coqui_stt_training.util.config import initialize_globals_from_args
-from coqui_stt_training.train import train, test
+from coqui_stt_training.train import train
+from coqui_stt_training.evaluate import test

 # only one GPU for only one training sample
 os.environ["CUDA_VISIBLE_DEVICES"] = "0"
@@ -21,5 +22,4 @@ initialize_globals_from_args(
 )
 train()
 test()

View File

@@ -18,6 +18,7 @@ def main():
         "coqpit",
         "numpy",
         "optuna",
+        "numba <= 0.53.1",
         "opuslib == 2.0.0",
         "pandas",
         "progressbar2",

View File

@@ -0,0 +1,403 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
LOG_LEVEL_INDEX = sys.argv.index("--log_level") + 1 if "--log_level" in sys.argv else 0
DESIRED_LOG_LEVEL = (
sys.argv[LOG_LEVEL_INDEX] if 0 < LOG_LEVEL_INDEX < len(sys.argv) else "3"
)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = DESIRED_LOG_LEVEL
import numpy as np
import tensorflow as tf
import tensorflow.compat.v1 as tfv1
tfv1.logging.set_verbosity(
{
"0": tfv1.logging.DEBUG,
"1": tfv1.logging.INFO,
"2": tfv1.logging.WARN,
"3": tfv1.logging.ERROR,
}.get(DESIRED_LOG_LEVEL)
)
from .util.config import Config
from .util.feeding import audio_to_features
def variable_on_cpu(name, shape, initializer):
r"""
Next we concern ourselves with graph creation.
However, before we do so we must introduce a utility function ``variable_on_cpu()``
used to create a variable in CPU memory.
"""
# Use the /cpu:0 device for scoped operations
with tf.device(Config.cpu_device):
# Create or get the appropriate variable
var = tfv1.get_variable(name=name, shape=shape, initializer=initializer)
return var
def create_overlapping_windows(batch_x):
batch_size = tf.shape(input=batch_x)[0]
window_width = 2 * Config.n_context + 1
num_channels = Config.n_input
# Create a constant convolution filter using an identity matrix, so that the
# convolution returns patches of the input tensor as is, and we can create
# overlapping windows over the MFCCs.
eye_filter = tf.constant(
np.eye(window_width * num_channels).reshape(
window_width, num_channels, window_width * num_channels
),
tf.float32,
) # pylint: disable=bad-continuation
# Create overlapping windows
batch_x = tf.nn.conv1d(input=batch_x, filters=eye_filter, stride=1, padding="SAME")
# Remove dummy depth dimension and reshape into [batch_size, n_windows, window_width, n_input]
batch_x = tf.reshape(batch_x, [batch_size, -1, window_width, num_channels])
return batch_x
def dense(name, x, units, dropout_rate=None, relu=True, layer_norm=False):
with tfv1.variable_scope(name):
bias = variable_on_cpu("bias", [units], tfv1.zeros_initializer())
weights = variable_on_cpu(
"weights",
[x.shape[-1], units],
tfv1.keras.initializers.VarianceScaling(
scale=1.0, mode="fan_avg", distribution="uniform"
),
)
output = tf.nn.bias_add(tf.matmul(x, weights), bias)
if relu:
output = tf.minimum(tf.nn.relu(output), Config.relu_clip)
if layer_norm:
with tfv1.variable_scope(name):
output = tf.contrib.layers.layer_norm(output)
if dropout_rate is not None:
output = tf.nn.dropout(output, rate=dropout_rate)
return output
def rnn_impl_lstmblockfusedcell(x, seq_length, previous_state, reuse):
with tfv1.variable_scope("cudnn_lstm/rnn/multi_rnn_cell/cell_0"):
fw_cell = tf.contrib.rnn.LSTMBlockFusedCell(
Config.n_cell_dim,
forget_bias=0,
reuse=reuse,
name="cudnn_compatible_lstm_cell",
)
output, output_state = fw_cell(
inputs=x,
dtype=tf.float32,
sequence_length=seq_length,
initial_state=previous_state,
)
return output, output_state
def rnn_impl_cudnn_rnn(x, seq_length, previous_state, _):
assert (
previous_state is None
) # 'Passing previous state not supported with CuDNN backend'
# Hack: CudnnLSTM works similarly to Keras layers in that when you instantiate
# the object it creates the variables, and then you just call it several times
# to enable variable re-use. Because all of our code is structured in an old-
# school TensorFlow style where you can just call tf.get_variable again with
# reuse=True to reuse variables, we can't easily make use of the object oriented
# way CudnnLSTM is implemented, so we save a singleton instance in the function,
# emulating a static function variable.
if not rnn_impl_cudnn_rnn.cell:
# Forward direction cell:
fw_cell = tf.contrib.cudnn_rnn.CudnnLSTM(
num_layers=1,
num_units=Config.n_cell_dim,
input_mode="linear_input",
direction="unidirectional",
dtype=tf.float32,
)
rnn_impl_cudnn_rnn.cell = fw_cell
output, output_state = rnn_impl_cudnn_rnn.cell(
inputs=x, sequence_lengths=seq_length
)
return output, output_state
rnn_impl_cudnn_rnn.cell = None
def rnn_impl_static_rnn(x, seq_length, previous_state, reuse):
with tfv1.variable_scope("cudnn_lstm/rnn/multi_rnn_cell"):
# Forward direction cell:
fw_cell = tfv1.nn.rnn_cell.LSTMCell(
Config.n_cell_dim,
forget_bias=0,
reuse=reuse,
name="cudnn_compatible_lstm_cell",
)
# Split rank N tensor into list of rank N-1 tensors
x = [x[l] for l in range(x.shape[0])]
output, output_state = tfv1.nn.static_rnn(
cell=fw_cell,
inputs=x,
sequence_length=seq_length,
initial_state=previous_state,
dtype=tf.float32,
scope="cell_0",
)
output = tf.concat(output, 0)
return output, output_state
def create_model(
batch_x,
seq_length,
dropout,
reuse=False,
batch_size=None,
previous_state=None,
overlap=True,
rnn_impl=rnn_impl_lstmblockfusedcell,
):
layers = {}
# Input shape: [batch_size, n_steps, n_input + 2*n_input*n_context]
if not batch_size:
batch_size = tf.shape(input=batch_x)[0]
# Create overlapping feature windows if needed
if overlap:
batch_x = create_overlapping_windows(batch_x)
# Reshaping `batch_x` to a tensor with shape `[n_steps*batch_size, n_input + 2*n_input*n_context]`.
# This is done to prepare the batch for input into the first layer which expects a tensor of rank `2`.
# Permute n_steps and batch_size
batch_x = tf.transpose(a=batch_x, perm=[1, 0, 2, 3])
# Reshape to prepare input for first layer
batch_x = tf.reshape(
batch_x, [-1, Config.n_input + 2 * Config.n_input * Config.n_context]
) # (n_steps*batch_size, n_input + 2*n_input*n_context)
layers["input_reshaped"] = batch_x
# The next three blocks will pass `batch_x` through three hidden layers with
# clipped RELU activation and dropout.
layers["layer_1"] = layer_1 = dense(
"layer_1",
batch_x,
Config.n_hidden_1,
dropout_rate=dropout[0],
layer_norm=Config.layer_norm,
)
layers["layer_2"] = layer_2 = dense(
"layer_2",
layer_1,
Config.n_hidden_2,
dropout_rate=dropout[1],
layer_norm=Config.layer_norm,
)
layers["layer_3"] = layer_3 = dense(
"layer_3",
layer_2,
Config.n_hidden_3,
dropout_rate=dropout[2],
layer_norm=Config.layer_norm,
)
# `layer_3` is now reshaped into `[n_steps, batch_size, 2*n_cell_dim]`,
# as the LSTM RNN expects its input to be of shape `[max_time, batch_size, input_size]`.
layer_3 = tf.reshape(layer_3, [-1, batch_size, Config.n_hidden_3])
# Run through parametrized RNN implementation, as we use different RNNs
# for training and inference
output, output_state = rnn_impl(layer_3, seq_length, previous_state, reuse)
# Reshape output from a tensor of shape [n_steps, batch_size, n_cell_dim]
# to a tensor of shape [n_steps*batch_size, n_cell_dim]
output = tf.reshape(output, [-1, Config.n_cell_dim])
layers["rnn_output"] = output
layers["rnn_output_state"] = output_state
# Now we feed `output` to the fifth hidden layer with clipped RELU activation
layers["layer_5"] = layer_5 = dense(
"layer_5",
output,
Config.n_hidden_5,
dropout_rate=dropout[5],
layer_norm=Config.layer_norm,
)
# Now we apply a final linear layer creating `n_classes` dimensional vectors, the logits.
layers["layer_6"] = layer_6 = dense(
"layer_6", layer_5, Config.n_hidden_6, relu=False
)
# Finally we reshape layer_6 from a tensor of shape [n_steps*batch_size, n_hidden_6]
# to the slightly more useful shape [n_steps, batch_size, n_hidden_6].
# Note, that this differs from the input in that it is time-major.
layer_6 = tf.reshape(
layer_6, [-1, batch_size, Config.n_hidden_6], name="raw_logits"
)
layers["raw_logits"] = layer_6
# Output shape: [n_steps, batch_size, n_hidden_6]
return layer_6, layers
def create_inference_graph(batch_size=1, n_steps=16, tflite=False):
batch_size = batch_size if batch_size > 0 else None
# Create feature computation graph
# native_client: this node's name and shape are part of the API boundary
# with the native client, if you change them you should sync changes with
# the C++ code.
input_samples = tfv1.placeholder(
tf.float32, [Config.audio_window_samples], "input_samples"
)
samples = tf.expand_dims(input_samples, -1)
mfccs, _ = audio_to_features(samples, Config.audio_sample_rate)
# native_client: this node's name and shape are part of the API boundary
# with the native client, if you change them you should sync changes with
# the C++ code.
mfccs = tf.identity(mfccs, name="mfccs")
# Input tensor will be of shape [batch_size, n_steps, 2*n_context+1, n_input]
# This shape is read by the native_client in STT_CreateModel to know the
# value of n_steps, n_context and n_input. Make sure you update the code
# there if this shape is changed.
#
# native_client: this node's name and shape are part of the API boundary
# with the native client, if you change them you should sync changes with
# the C++ code.
input_tensor = tfv1.placeholder(
tf.float32,
[
batch_size,
n_steps if n_steps > 0 else None,
2 * Config.n_context + 1,
Config.n_input,
],
name="input_node",
)
# native_client: this node's name and shape are part of the API boundary
# with the native client, if you change them you should sync changes with
# the C++ code.
seq_length = tfv1.placeholder(tf.int32, [batch_size], name="input_lengths")
if batch_size <= 0:
# no state management since n_steps is expected to be dynamic too (see below)
previous_state = None
else:
# native_client: this node's name and shape are part of the API boundary
# with the native client, if you change them you should sync changes with
# the C++ code.
previous_state_c = tfv1.placeholder(
tf.float32, [batch_size, Config.n_cell_dim], name="previous_state_c"
)
# native_client: this node's name and shape are part of the API boundary
# with the native client, if you change them you should sync changes with
# the C++ code.
previous_state_h = tfv1.placeholder(
tf.float32, [batch_size, Config.n_cell_dim], name="previous_state_h"
)
previous_state = tf.nn.rnn_cell.LSTMStateTuple(
previous_state_c, previous_state_h
)
# One rate per layer
no_dropout = [None] * 6
if tflite:
rnn_impl = rnn_impl_static_rnn
else:
rnn_impl = rnn_impl_lstmblockfusedcell
logits, layers = create_model(
batch_x=input_tensor,
batch_size=batch_size,
seq_length=seq_length if not Config.export_tflite else None,
dropout=no_dropout,
previous_state=previous_state,
overlap=False,
rnn_impl=rnn_impl,
)
# TF Lite runtime will check that input dimensions are 1, 2 or 4
# by default we get 3, the middle one being batch_size which is forced to
# one on inference graph, so remove that dimension
#
# native_client: this node's name and shape are part of the API boundary
# with the native client, if you change them you should sync changes with
# the C++ code.
if tflite:
logits = tf.squeeze(logits, [1])
# Apply softmax for CTC decoder
probs = tf.nn.softmax(logits, name="logits")
if batch_size <= 0:
if tflite:
raise NotImplementedError(
"dynamic batch_size does not support tflite nor streaming"
)
if n_steps > 0:
raise NotImplementedError(
"dynamic batch_size expect n_steps to be dynamic too"
)
return (
{
"input": input_tensor,
"input_lengths": seq_length,
},
{
"outputs": probs,
},
layers,
)
new_state_c, new_state_h = layers["rnn_output_state"]
new_state_c = tf.identity(new_state_c, name="new_state_c")
new_state_h = tf.identity(new_state_h, name="new_state_h")
inputs = {
"input": input_tensor,
"previous_state_c": previous_state_c,
"previous_state_h": previous_state_h,
"input_samples": input_samples,
}
if not Config.export_tflite:
inputs["input_lengths"] = seq_length
outputs = {
"outputs": probs,
"new_state_c": new_state_c,
"new_state_h": new_state_h,
"mfccs": mfccs,
# Expose internal layers for downstream applications
"layer_3": layers["layer_3"],
"layer_5": layers["layer_5"],
}
return inputs, outputs, layers
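The functions above are consumed by the new train.py, evaluate.py, export.py and training_graph_inference.py modules elsewhere in this commit. A rough sketch of the intended call pattern for the inference graph, assuming the global Config has already been initialized from the command line:

# Sketch: build the streaming inference graph from the new module.
import tensorflow.compat.v1 as tfv1
from coqui_stt_training.util.config import initialize_globals_from_cli
from coqui_stt_training.deepspeech_model import create_inference_graph

initialize_globals_from_cli()   # populate Config from CLI flags
tfv1.reset_default_graph()
inputs, outputs, layers = create_inference_graph(batch_size=1, n_steps=16)
# inputs/outputs hold the placeholders and tensors whose names form the
# native_client API boundary (input_node, input_lengths, previous_state_*, logits, ...)
print(sorted(inputs), sorted(outputs))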

training/coqui_stt_training/evaluate.py Executable file → Normal file
View File

@@ -13,6 +13,7 @@ from six.moves import zip
 import tensorflow as tf

+from .deepspeech_model import create_model
 from .util.augmentations import NormalizeSampleRate
 from .util.checkpoints import load_graph_for_evaluation
 from .util.config import (
@@ -168,25 +169,25 @@ def evaluate(test_csvs, create_model):
     return samples


-def main():
-    initialize_globals_from_cli()
-
-    if not Config.test_files:
-        log_error(
-            "You need to specify what files to use for evaluation via "
-            "the --test_files flag."
-        )
-        sys.exit(1)
-
-    from .train import (  # pylint: disable=cyclic-import,import-outside-toplevel
-        create_model,
-    )
+def test():
+    tfv1.reset_default_graph()

     samples = evaluate(Config.test_files, create_model)

     if Config.test_output_file:
         save_samples_json(samples, Config.test_output_file)


+def main():
+    initialize_globals_from_cli()
+
+    if not Config.test_files:
+        raise RuntimeError(
+            "You need to specify what files to use for evaluation via "
+            "the --test_files flag."
+        )
+
+    test()
+

 if __name__ == "__main__":
     main()

View File

@@ -0,0 +1,216 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
LOG_LEVEL_INDEX = sys.argv.index("--log_level") + 1 if "--log_level" in sys.argv else 0
DESIRED_LOG_LEVEL = (
sys.argv[LOG_LEVEL_INDEX] if 0 < LOG_LEVEL_INDEX < len(sys.argv) else "3"
)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = DESIRED_LOG_LEVEL
import tensorflow as tf
import tensorflow.compat.v1 as tfv1
import shutil
from .deepspeech_model import create_inference_graph
from .util.checkpoints import load_graph_for_evaluation
from .util.config import Config, initialize_globals_from_cli, log_error, log_info
from .util.io import (
open_remote,
rmtree_remote,
listdir_remote,
is_remote_path,
isdir_remote,
)
def file_relative_read(fname):
return open(os.path.join(os.path.dirname(__file__), fname)).read()
def export():
r"""
Restores the trained variables into a simpler graph that will be exported for serving.
"""
log_info("Exporting the model...")
tfv1.reset_default_graph()
inputs, outputs, _ = create_inference_graph(
batch_size=Config.export_batch_size,
n_steps=Config.n_steps,
tflite=Config.export_tflite,
)
graph_version = int(file_relative_read("GRAPH_VERSION").strip())
assert graph_version > 0
# native_client: these nodes' names and shapes are part of the API boundary
# with the native client, if you change them you should sync changes with
# the C++ code.
outputs["metadata_version"] = tf.constant([graph_version], name="metadata_version")
outputs["metadata_sample_rate"] = tf.constant(
[Config.audio_sample_rate], name="metadata_sample_rate"
)
outputs["metadata_feature_win_len"] = tf.constant(
[Config.feature_win_len], name="metadata_feature_win_len"
)
outputs["metadata_feature_win_step"] = tf.constant(
[Config.feature_win_step], name="metadata_feature_win_step"
)
outputs["metadata_beam_width"] = tf.constant(
[Config.export_beam_width], name="metadata_beam_width"
)
outputs["metadata_alphabet"] = tf.constant(
[Config.alphabet.Serialize()], name="metadata_alphabet"
)
if Config.export_language:
outputs["metadata_language"] = tf.constant(
[Config.export_language.encode("utf-8")], name="metadata_language"
)
# Prevent further graph changes
tfv1.get_default_graph().finalize()
output_names_tensors = [
tensor.op.name for tensor in outputs.values() if isinstance(tensor, tf.Tensor)
]
output_names_ops = [
op.name for op in outputs.values() if isinstance(op, tf.Operation)
]
output_names = output_names_tensors + output_names_ops
with tf.Session() as session:
# Restore variables from checkpoint
load_graph_for_evaluation(session)
output_filename = Config.export_file_name + ".pb"
if Config.remove_export:
if isdir_remote(Config.export_dir):
log_info("Removing old export")
rmtree_remote(Config.export_dir)
output_graph_path = os.path.join(Config.export_dir, output_filename)
if not is_remote_path(Config.export_dir) and not os.path.isdir(
Config.export_dir
):
os.makedirs(Config.export_dir)
frozen_graph = tfv1.graph_util.convert_variables_to_constants(
sess=session,
input_graph_def=tfv1.get_default_graph().as_graph_def(),
output_node_names=output_names,
)
frozen_graph = tfv1.graph_util.extract_sub_graph(
graph_def=frozen_graph, dest_nodes=output_names
)
if not Config.export_tflite:
with open_remote(output_graph_path, "wb") as fout:
fout.write(frozen_graph.SerializeToString())
else:
output_tflite_path = os.path.join(
Config.export_dir, output_filename.replace(".pb", ".tflite")
)
converter = tf.lite.TFLiteConverter(
frozen_graph,
input_tensors=inputs.values(),
output_tensors=outputs.values(),
)
if Config.export_quantize:
converter.optimizations = [tf.lite.Optimize.DEFAULT]
# AudioSpectrogram and Mfcc ops are custom but have built-in kernels in TFLite
converter.allow_custom_ops = True
tflite_model = converter.convert()
with open_remote(output_tflite_path, "wb") as fout:
fout.write(tflite_model)
log_info("Models exported at %s" % (Config.export_dir))
metadata_fname = os.path.join(
Config.export_dir,
"{}_{}_{}.md".format(
Config.export_author_id,
Config.export_model_name,
Config.export_model_version,
),
)
model_runtime = "tflite" if Config.export_tflite else "tensorflow"
with open_remote(metadata_fname, "w") as f:
f.write("---\n")
f.write("author: {}\n".format(Config.export_author_id))
f.write("model_name: {}\n".format(Config.export_model_name))
f.write("model_version: {}\n".format(Config.export_model_version))
f.write("contact_info: {}\n".format(Config.export_contact_info))
f.write("license: {}\n".format(Config.export_license))
f.write("language: {}\n".format(Config.export_language))
f.write("runtime: {}\n".format(model_runtime))
f.write("min_stt_version: {}\n".format(Config.export_min_stt_version))
f.write("max_stt_version: {}\n".format(Config.export_max_stt_version))
f.write(
"acoustic_model_url: <replace this with a publicly available URL of the acoustic model>\n"
)
f.write(
"scorer_url: <replace this with a publicly available URL of the scorer, if present>\n"
)
f.write("---\n")
f.write("{}\n".format(Config.export_description))
log_info(
"Model metadata file saved to {}. Before submitting the exported model for publishing make sure all information in the metadata file is correct, and complete the URL fields.".format(
metadata_fname
)
)
def package_zip():
# --export_dir path/to/export/LANG_CODE/ => path/to/export/LANG_CODE.zip
export_dir = os.path.join(
os.path.abspath(Config.export_dir), ""
) # Force ending '/'
if is_remote_path(export_dir):
log_error(
"Cannot package remote path zip %s. Please do this manually." % export_dir
)
return
zip_filename = os.path.dirname(export_dir)
shutil.copy(Config.scorer_path, export_dir)
archive = shutil.make_archive(zip_filename, "zip", export_dir)
log_info("Exported packaged model {}".format(archive))
def main():
initialize_globals_from_cli()
if not Config.export_dir:
raise RuntimeError(
"Calling export script directly but no --export_dir specified"
)
if not Config.export_zip:
# Export to folder
export()
else:
if listdir_remote(Config.export_dir):
raise RuntimeError(
"Directory {} is not empty, please fix this.".format(Config.export_dir)
)
export()
package_zip()
if __name__ == "__main__":
main()
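Taken together with the config change further down (export_tflite now defaults to True), the standalone exporter can be driven roughly like this; paths and flag values are placeholders, not values from this PR:

# Sketch: standalone export of an already-trained checkpoint.
from coqui_stt_training.util.config import initialize_globals_from_args
from coqui_stt_training.export import export, package_zip

initialize_globals_from_args(
    checkpoint_dir="/tmp/ckpt",                      # placeholder paths
    export_dir="/tmp/export/eng",
    export_tflite=False,                             # keep the protobuf export; the default flips to True in this PR
    scorer_path="data/smoke_test/pruned_lm.scorer",  # copied next to the model by package_zip()
)
export()
package_zip()   # produces /tmp/export/eng.zip from the export directory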

View File

@@ -14,12 +14,13 @@ os.environ["TF_CPP_MIN_LOG_LEVEL"] = DESIRED_LOG_LEVEL
 import json
 import shutil
 import time
+from datetime import datetime

 import numpy as np
 import progressbar
 import tensorflow.compat.v1 as tfv1
 import tensorflow as tf
+from coqui_stt_ctcdecoder import Scorer

 tfv1.logging.set_verbosity(
     {
@@ -30,12 +31,15 @@ tfv1.logging.set_verbosity(
     }.get(DESIRED_LOG_LEVEL)
 )

-from datetime import datetime
-from coqui_stt_ctcdecoder import Scorer, ctc_beam_search_decoder
-from six.moves import range, zip
-from .evaluate import evaluate
+from . import evaluate
+from . import export
+from . import training_graph_inference
+from .deepspeech_model import (
+    create_model,
+    rnn_impl_lstmblockfusedcell,
+    rnn_impl_cudnn_rnn,
+)
 from .util.augmentations import NormalizeSampleRate
 from .util.checkpoints import (
     load_graph_for_evaluation,
@@ -52,260 +56,16 @@ from .util.config import (
     log_progress,
     log_warn,
 )
-from .util.evaluate_tools import save_samples_json
-from .util.feeding import audio_to_features, audiofile_to_features, create_dataset
+from .util.feeding import create_dataset
 from .util.helpers import ExceptionBox, check_ctcdecoder_version
 from .util.io import (
     is_remote_path,
-    isdir_remote,
-    listdir_remote,
     open_remote,
     remove_remote,
 )

 check_ctcdecoder_version()
# Graph Creation
# ==============
def variable_on_cpu(name, shape, initializer):
r"""
Next we concern ourselves with graph creation.
However, before we do so we must introduce a utility function ``variable_on_cpu()``
used to create a variable in CPU memory.
"""
# Use the /cpu:0 device for scoped operations
with tf.device(Config.cpu_device):
# Create or get the appropriate variable
var = tfv1.get_variable(name=name, shape=shape, initializer=initializer)
return var
def create_overlapping_windows(batch_x):
batch_size = tf.shape(input=batch_x)[0]
window_width = 2 * Config.n_context + 1
num_channels = Config.n_input
# Create a constant convolution filter using an identity matrix, so that the
# convolution returns patches of the input tensor as is, and we can create
# overlapping windows over the MFCCs.
eye_filter = tf.constant(
np.eye(window_width * num_channels).reshape(
window_width, num_channels, window_width * num_channels
),
tf.float32,
) # pylint: disable=bad-continuation
# Create overlapping windows
batch_x = tf.nn.conv1d(input=batch_x, filters=eye_filter, stride=1, padding="SAME")
# Remove dummy depth dimension and reshape into [batch_size, n_windows, window_width, n_input]
batch_x = tf.reshape(batch_x, [batch_size, -1, window_width, num_channels])
return batch_x
def dense(name, x, units, dropout_rate=None, relu=True, layer_norm=False):
with tfv1.variable_scope(name):
bias = variable_on_cpu("bias", [units], tfv1.zeros_initializer())
weights = variable_on_cpu(
"weights",
[x.shape[-1], units],
tfv1.keras.initializers.VarianceScaling(
scale=1.0, mode="fan_avg", distribution="uniform"
),
)
output = tf.nn.bias_add(tf.matmul(x, weights), bias)
if relu:
output = tf.minimum(tf.nn.relu(output), Config.relu_clip)
if layer_norm:
with tfv1.variable_scope(name):
output = tf.contrib.layers.layer_norm(output)
if dropout_rate is not None:
output = tf.nn.dropout(output, rate=dropout_rate)
return output
def rnn_impl_lstmblockfusedcell(x, seq_length, previous_state, reuse):
with tfv1.variable_scope("cudnn_lstm/rnn/multi_rnn_cell/cell_0"):
fw_cell = tf.contrib.rnn.LSTMBlockFusedCell(
Config.n_cell_dim,
forget_bias=0,
reuse=reuse,
name="cudnn_compatible_lstm_cell",
)
output, output_state = fw_cell(
inputs=x,
dtype=tf.float32,
sequence_length=seq_length,
initial_state=previous_state,
)
return output, output_state
def rnn_impl_cudnn_rnn(x, seq_length, previous_state, _):
assert (
previous_state is None
) # 'Passing previous state not supported with CuDNN backend'
# Hack: CudnnLSTM works similarly to Keras layers in that when you instantiate
# the object it creates the variables, and then you just call it several times
# to enable variable re-use. Because all of our code is structured in an old-
# school TensorFlow style where you can just call tf.get_variable again with
# reuse=True to reuse variables, we can't easily make use of the object oriented
# way CudnnLSTM is implemented, so we save a singleton instance in the function,
# emulating a static function variable.
if not rnn_impl_cudnn_rnn.cell:
# Forward direction cell:
fw_cell = tf.contrib.cudnn_rnn.CudnnLSTM(
num_layers=1,
num_units=Config.n_cell_dim,
input_mode="linear_input",
direction="unidirectional",
dtype=tf.float32,
)
rnn_impl_cudnn_rnn.cell = fw_cell
output, output_state = rnn_impl_cudnn_rnn.cell(
inputs=x, sequence_lengths=seq_length
)
return output, output_state
rnn_impl_cudnn_rnn.cell = None
def rnn_impl_static_rnn(x, seq_length, previous_state, reuse):
with tfv1.variable_scope("cudnn_lstm/rnn/multi_rnn_cell"):
# Forward direction cell:
fw_cell = tfv1.nn.rnn_cell.LSTMCell(
Config.n_cell_dim,
forget_bias=0,
reuse=reuse,
name="cudnn_compatible_lstm_cell",
)
# Split rank N tensor into list of rank N-1 tensors
x = [x[l] for l in range(x.shape[0])]
output, output_state = tfv1.nn.static_rnn(
cell=fw_cell,
inputs=x,
sequence_length=seq_length,
initial_state=previous_state,
dtype=tf.float32,
scope="cell_0",
)
output = tf.concat(output, 0)
return output, output_state
def create_model(
batch_x,
seq_length,
dropout,
reuse=False,
batch_size=None,
previous_state=None,
overlap=True,
rnn_impl=rnn_impl_lstmblockfusedcell,
):
layers = {}
# Input shape: [batch_size, n_steps, n_input + 2*n_input*n_context]
if not batch_size:
batch_size = tf.shape(input=batch_x)[0]
# Create overlapping feature windows if needed
if overlap:
batch_x = create_overlapping_windows(batch_x)
# Reshaping `batch_x` to a tensor with shape `[n_steps*batch_size, n_input + 2*n_input*n_context]`.
# This is done to prepare the batch for input into the first layer which expects a tensor of rank `2`.
# Permute n_steps and batch_size
batch_x = tf.transpose(a=batch_x, perm=[1, 0, 2, 3])
# Reshape to prepare input for first layer
batch_x = tf.reshape(
batch_x, [-1, Config.n_input + 2 * Config.n_input * Config.n_context]
) # (n_steps*batch_size, n_input + 2*n_input*n_context)
layers["input_reshaped"] = batch_x
# The next three blocks will pass `batch_x` through three hidden layers with
# clipped RELU activation and dropout.
layers["layer_1"] = layer_1 = dense(
"layer_1",
batch_x,
Config.n_hidden_1,
dropout_rate=dropout[0],
layer_norm=Config.layer_norm,
)
layers["layer_2"] = layer_2 = dense(
"layer_2",
layer_1,
Config.n_hidden_2,
dropout_rate=dropout[1],
layer_norm=Config.layer_norm,
)
layers["layer_3"] = layer_3 = dense(
"layer_3",
layer_2,
Config.n_hidden_3,
dropout_rate=dropout[2],
layer_norm=Config.layer_norm,
)
# `layer_3` is now reshaped into `[n_steps, batch_size, 2*n_cell_dim]`,
# as the LSTM RNN expects its input to be of shape `[max_time, batch_size, input_size]`.
layer_3 = tf.reshape(layer_3, [-1, batch_size, Config.n_hidden_3])
# Run through parametrized RNN implementation, as we use different RNNs
# for training and inference
output, output_state = rnn_impl(layer_3, seq_length, previous_state, reuse)
# Reshape output from a tensor of shape [n_steps, batch_size, n_cell_dim]
# to a tensor of shape [n_steps*batch_size, n_cell_dim]
output = tf.reshape(output, [-1, Config.n_cell_dim])
layers["rnn_output"] = output
layers["rnn_output_state"] = output_state
# Now we feed `output` to the fifth hidden layer with clipped RELU activation
layers["layer_5"] = layer_5 = dense(
"layer_5",
output,
Config.n_hidden_5,
dropout_rate=dropout[5],
layer_norm=Config.layer_norm,
)
# Now we apply a final linear layer creating `n_classes` dimensional vectors, the logits.
layers["layer_6"] = layer_6 = dense(
"layer_6", layer_5, Config.n_hidden_6, relu=False
)
# Finally we reshape layer_6 from a tensor of shape [n_steps*batch_size, n_hidden_6]
# to the slightly more useful shape [n_steps, batch_size, n_hidden_6].
# Note, that this differs from the input in that it is time-major.
layer_6 = tf.reshape(
layer_6, [-1, batch_size, Config.n_hidden_6], name="raw_logits"
)
layers["raw_logits"] = layer_6
# Output shape: [n_steps, batch_size, n_hidden_6]
return layer_6, layers
# Accuracy and Loss
# =================
@@ -900,371 +660,6 @@ def train():
log_debug("Session closed.")
def test():
tfv1.reset_default_graph()
samples = evaluate(Config.test_files, create_model)
if Config.test_output_file:
save_samples_json(samples, Config.test_output_file)
def create_inference_graph(batch_size=1, n_steps=16, tflite=False):
batch_size = batch_size if batch_size > 0 else None
# Create feature computation graph
# native_client: this node's name and shape are part of the API boundary
# with the native client, if you change them you should sync changes with
# the C++ code.
input_samples = tfv1.placeholder(
tf.float32, [Config.audio_window_samples], "input_samples"
)
samples = tf.expand_dims(input_samples, -1)
mfccs, _ = audio_to_features(samples, Config.audio_sample_rate)
# native_client: this node's name and shape are part of the API boundary
# with the native client, if you change them you should sync changes with
# the C++ code.
mfccs = tf.identity(mfccs, name="mfccs")
# Input tensor will be of shape [batch_size, n_steps, 2*n_context+1, n_input]
# This shape is read by the native_client in STT_CreateModel to know the
# value of n_steps, n_context and n_input. Make sure you update the code
# there if this shape is changed.
#
# native_client: this node's name and shape are part of the API boundary
# with the native client, if you change them you should sync changes with
# the C++ code.
input_tensor = tfv1.placeholder(
tf.float32,
[
batch_size,
n_steps if n_steps > 0 else None,
2 * Config.n_context + 1,
Config.n_input,
],
name="input_node",
)
# native_client: this node's name and shape are part of the API boundary
# with the native client, if you change them you should sync changes with
# the C++ code.
seq_length = tfv1.placeholder(tf.int32, [batch_size], name="input_lengths")
if batch_size <= 0:
# no state management since n_steps is expected to be dynamic too (see below)
previous_state = None
else:
# native_client: this node's name and shape are part of the API boundary
# with the native client, if you change them you should sync changes with
# the C++ code.
previous_state_c = tfv1.placeholder(
tf.float32, [batch_size, Config.n_cell_dim], name="previous_state_c"
)
# native_client: this node's name and shape are part of the API boundary
# with the native client, if you change them you should sync changes with
# the C++ code.
previous_state_h = tfv1.placeholder(
tf.float32, [batch_size, Config.n_cell_dim], name="previous_state_h"
)
previous_state = tf.nn.rnn_cell.LSTMStateTuple(
previous_state_c, previous_state_h
)
# One rate per layer
no_dropout = [None] * 6
if tflite:
rnn_impl = rnn_impl_static_rnn
else:
rnn_impl = rnn_impl_lstmblockfusedcell
logits, layers = create_model(
batch_x=input_tensor,
batch_size=batch_size,
seq_length=seq_length if not Config.export_tflite else None,
dropout=no_dropout,
previous_state=previous_state,
overlap=False,
rnn_impl=rnn_impl,
)
# TF Lite runtime will check that input dimensions are 1, 2 or 4
# by default we get 3, the middle one being batch_size which is forced to
# one on inference graph, so remove that dimension
#
# native_client: this node's name and shape are part of the API boundary
# with the native client, if you change them you should sync changes with
# the C++ code.
if tflite:
logits = tf.squeeze(logits, [1])
# Apply softmax for CTC decoder
probs = tf.nn.softmax(logits, name="logits")
if batch_size <= 0:
if tflite:
raise NotImplementedError(
"dynamic batch_size does not support tflite nor streaming"
)
if n_steps > 0:
raise NotImplementedError(
"dynamic batch_size expect n_steps to be dynamic too"
)
return (
{
"input": input_tensor,
"input_lengths": seq_length,
},
{
"outputs": probs,
},
layers,
)
new_state_c, new_state_h = layers["rnn_output_state"]
new_state_c = tf.identity(new_state_c, name="new_state_c")
new_state_h = tf.identity(new_state_h, name="new_state_h")
inputs = {
"input": input_tensor,
"previous_state_c": previous_state_c,
"previous_state_h": previous_state_h,
"input_samples": input_samples,
}
if not Config.export_tflite:
inputs["input_lengths"] = seq_length
outputs = {
"outputs": probs,
"new_state_c": new_state_c,
"new_state_h": new_state_h,
"mfccs": mfccs,
# Expose internal layers for downstream applications
"layer_3": layers["layer_3"],
"layer_5": layers["layer_5"],
}
return inputs, outputs, layers
def file_relative_read(fname):
return open(os.path.join(os.path.dirname(__file__), fname)).read()
def export():
r"""
Restores the trained variables into a simpler graph that will be exported for serving.
"""
log_info("Exporting the model...")
tfv1.reset_default_graph()
inputs, outputs, _ = create_inference_graph(
batch_size=Config.export_batch_size,
n_steps=Config.n_steps,
tflite=Config.export_tflite,
)
graph_version = int(file_relative_read("GRAPH_VERSION").strip())
assert graph_version > 0
# native_client: these nodes' names and shapes are part of the API boundary
# with the native client, if you change them you should sync changes with
# the C++ code.
outputs["metadata_version"] = tf.constant([graph_version], name="metadata_version")
outputs["metadata_sample_rate"] = tf.constant(
[Config.audio_sample_rate], name="metadata_sample_rate"
)
outputs["metadata_feature_win_len"] = tf.constant(
[Config.feature_win_len], name="metadata_feature_win_len"
)
outputs["metadata_feature_win_step"] = tf.constant(
[Config.feature_win_step], name="metadata_feature_win_step"
)
outputs["metadata_beam_width"] = tf.constant(
[Config.export_beam_width], name="metadata_beam_width"
)
outputs["metadata_alphabet"] = tf.constant(
[Config.alphabet.Serialize()], name="metadata_alphabet"
)
if Config.export_language:
outputs["metadata_language"] = tf.constant(
[Config.export_language.encode("utf-8")], name="metadata_language"
)
# Prevent further graph changes
tfv1.get_default_graph().finalize()
output_names_tensors = [
tensor.op.name for tensor in outputs.values() if isinstance(tensor, tf.Tensor)
]
output_names_ops = [
op.name for op in outputs.values() if isinstance(op, tf.Operation)
]
output_names = output_names_tensors + output_names_ops
with tf.Session() as session:
# Restore variables from checkpoint
load_graph_for_evaluation(session)
output_filename = Config.export_file_name + ".pb"
if Config.remove_export:
if isdir_remote(Config.export_dir):
log_info("Removing old export")
remove_remote(Config.export_dir)
output_graph_path = os.path.join(Config.export_dir, output_filename)
if not is_remote_path(Config.export_dir) and not os.path.isdir(
Config.export_dir
):
os.makedirs(Config.export_dir)
frozen_graph = tfv1.graph_util.convert_variables_to_constants(
sess=session,
input_graph_def=tfv1.get_default_graph().as_graph_def(),
output_node_names=output_names,
)
frozen_graph = tfv1.graph_util.extract_sub_graph(
graph_def=frozen_graph, dest_nodes=output_names
)
if not Config.export_tflite:
with open_remote(output_graph_path, "wb") as fout:
fout.write(frozen_graph.SerializeToString())
else:
output_tflite_path = os.path.join(
Config.export_dir, output_filename.replace(".pb", ".tflite")
)
converter = tf.lite.TFLiteConverter(
frozen_graph,
input_tensors=inputs.values(),
output_tensors=outputs.values(),
)
if Config.export_quantize:
converter.optimizations = [tf.lite.Optimize.DEFAULT]
# AudioSpectrogram and Mfcc ops are custom but have built-in kernels in TFLite
converter.allow_custom_ops = True
tflite_model = converter.convert()
with open_remote(output_tflite_path, "wb") as fout:
fout.write(tflite_model)
log_info("Models exported at %s" % (Config.export_dir))
metadata_fname = os.path.join(
Config.export_dir,
"{}_{}_{}.md".format(
Config.export_author_id,
Config.export_model_name,
Config.export_model_version,
),
)
model_runtime = "tflite" if Config.export_tflite else "tensorflow"
with open_remote(metadata_fname, "w") as f:
f.write("---\n")
f.write("author: {}\n".format(Config.export_author_id))
f.write("model_name: {}\n".format(Config.export_model_name))
f.write("model_version: {}\n".format(Config.export_model_version))
f.write("contact_info: {}\n".format(Config.export_contact_info))
f.write("license: {}\n".format(Config.export_license))
f.write("language: {}\n".format(Config.export_language))
f.write("runtime: {}\n".format(model_runtime))
f.write("min_stt_version: {}\n".format(Config.export_min_stt_version))
f.write("max_stt_version: {}\n".format(Config.export_max_stt_version))
f.write(
"acoustic_model_url: <replace this with a publicly available URL of the acoustic model>\n"
)
f.write(
"scorer_url: <replace this with a publicly available URL of the scorer, if present>\n"
)
f.write("---\n")
f.write("{}\n".format(Config.export_description))
log_info(
"Model metadata file saved to {}. Before submitting the exported model for publishing make sure all information in the metadata file is correct, and complete the URL fields.".format(
metadata_fname
)
)
def package_zip():
# --export_dir path/to/export/LANG_CODE/ => path/to/export/LANG_CODE.zip
export_dir = os.path.join(
os.path.abspath(Config.export_dir), ""
) # Force ending '/'
if is_remote_path(export_dir):
log_error(
"Cannot package remote path zip %s. Please do this manually." % export_dir
)
return
zip_filename = os.path.dirname(export_dir)
shutil.copy(Config.scorer_path, export_dir)
archive = shutil.make_archive(zip_filename, "zip", export_dir)
log_info("Exported packaged model {}".format(archive))
def do_single_file_inference(input_file_path):
tfv1.reset_default_graph()
with tfv1.Session(config=Config.session_config) as session:
inputs, outputs, _ = create_inference_graph(batch_size=1, n_steps=-1)
# Restore variables from training checkpoint
load_graph_for_evaluation(session)
features, features_len = audiofile_to_features(input_file_path)
previous_state_c = np.zeros([1, Config.n_cell_dim])
previous_state_h = np.zeros([1, Config.n_cell_dim])
# Add batch dimension
features = tf.expand_dims(features, 0)
features_len = tf.expand_dims(features_len, 0)
# Evaluate
features = create_overlapping_windows(features).eval(session=session)
features_len = features_len.eval(session=session)
probs = outputs["outputs"].eval(
feed_dict={
inputs["input"]: features,
inputs["input_lengths"]: features_len,
inputs["previous_state_c"]: previous_state_c,
inputs["previous_state_h"]: previous_state_h,
},
session=session,
)
probs = np.squeeze(probs)
if Config.scorer_path:
scorer = Scorer(
Config.lm_alpha, Config.lm_beta, Config.scorer_path, Config.alphabet
)
else:
scorer = None
decoded = ctc_beam_search_decoder(
probs,
Config.alphabet,
Config.beam_width,
scorer=scorer,
cutoff_prob=Config.cutoff_prob,
cutoff_top_n=Config.cutoff_top_n,
)
# Print highest probability result
print(decoded[0][1])
 def early_training_checks():
     # Check for proper scorer early
     if Config.scorer_path:
@@ -1289,36 +684,47 @@ def early_training_checks():
         )

     if not Config.alphabet_config_path and not Config.bytes_output_mode:
-        log_error("Missing --alphabet_config_path flag, can't continue")
-        sys.exit(1)
+        raise RuntimeError("Missing --alphabet_config_path flag, can't continue")


 def main():
     initialize_globals_from_cli()

+    def deprecated_msg(prefix):
+        return (
+            f"{prefix} Using the training script as a generic driver for all training "
+            "related functionality is deprecated and will be removed soon. Use "
+            "the specific scripts: train.py/evaluate.py/export.py/training_graph_inference.py."
+        )
+
     if Config.train_files:
         train()
+    else:
+        log_warn(deprecated_msg("Calling training script without --train_files."))

     if Config.test_files:
-        test()
-
-    if Config.export_dir and not Config.export_zip:
-        export()
-
-    if Config.export_zip:
-        Config.export_tflite = True
-
-        if listdir_remote(Config.export_dir):
-            log_error(
-                "Directory {} is not empty, please fix this.".format(Config.export_dir)
-            )
-            sys.exit(1)
-
-        export()
-        package_zip()
+        log_warn(
+            deprecated_msg(
+                "Specifying --test_files when calling train.py script. Use evaluate.py."
+            )
+        )
+        evaluate.test()
+
+    if Config.export_dir:
+        log_warn(
+            deprecated_msg(
+                "Specifying --export_dir when calling train.py script. Use export.py."
+            )
+        )
+        export.export()

     if Config.one_shot_infer:
-        do_single_file_inference(Config.one_shot_infer)
+        log_warn(
+            deprecated_msg(
+                "Specifying --one_shot_infer when calling train.py script. Use training_graph_inference.py."
+            )
+        )
+        training_graph_inference.do_single_file_inference(Config.one_shot_infer)


 if __name__ == "__main__":

View File

@@ -0,0 +1,87 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
LOG_LEVEL_INDEX = sys.argv.index("--log_level") + 1 if "--log_level" in sys.argv else 0
DESIRED_LOG_LEVEL = (
sys.argv[LOG_LEVEL_INDEX] if 0 < LOG_LEVEL_INDEX < len(sys.argv) else "3"
)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = DESIRED_LOG_LEVEL
import numpy as np
import tensorflow as tf
import tensorflow.compat.v1 as tfv1
from coqui_stt_ctcdecoder import ctc_beam_search_decoder, Scorer
from .deepspeech_model import create_inference_graph, create_overlapping_windows
from .util.checkpoints import load_graph_for_evaluation
from .util.config import Config, initialize_globals_from_cli, log_error
from .util.feeding import audiofile_to_features
def do_single_file_inference(input_file_path):
tfv1.reset_default_graph()
with tfv1.Session(config=Config.session_config) as session:
inputs, outputs, _ = create_inference_graph(batch_size=1, n_steps=-1)
# Restore variables from training checkpoint
load_graph_for_evaluation(session)
features, features_len = audiofile_to_features(input_file_path)
previous_state_c = np.zeros([1, Config.n_cell_dim])
previous_state_h = np.zeros([1, Config.n_cell_dim])
# Add batch dimension
features = tf.expand_dims(features, 0)
features_len = tf.expand_dims(features_len, 0)
# Evaluate
features = create_overlapping_windows(features).eval(session=session)
features_len = features_len.eval(session=session)
probs = outputs["outputs"].eval(
feed_dict={
inputs["input"]: features,
inputs["input_lengths"]: features_len,
inputs["previous_state_c"]: previous_state_c,
inputs["previous_state_h"]: previous_state_h,
},
session=session,
)
probs = np.squeeze(probs)
if Config.scorer_path:
scorer = Scorer(
Config.lm_alpha, Config.lm_beta, Config.scorer_path, Config.alphabet
)
else:
scorer = None
decoded = ctc_beam_search_decoder(
probs,
Config.alphabet,
Config.beam_width,
scorer=scorer,
cutoff_prob=Config.cutoff_prob,
cutoff_top_n=Config.cutoff_top_n,
)
# Print highest probability result
print(decoded[0][1])
def main():
initialize_globals_from_cli()
if Config.one_shot_infer:
tfv1.reset_default_graph()
do_single_file_inference(Config.one_shot_infer)
else:
raise RuntimeError(
"Calling training_graph_inference script directly but no --one_shot_infer input audio file specified"
)
if __name__ == "__main__":
main()
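For completeness, the one-shot inference path that train.py used to expose via --one_shot_infer can now be reached programmatically along these lines; paths are placeholders:

# Sketch: single-file inference with the training graph.
from coqui_stt_training.util.config import initialize_globals_from_args
from coqui_stt_training.training_graph_inference import do_single_file_inference

initialize_globals_from_args(
    checkpoint_dir="/tmp/ckpt",                      # trained checkpoint to restore
    scorer_path="data/smoke_test/pruned_lm.scorer",  # optional; decoding works without a scorer
)
do_single_file_inference("audio/sample.wav")         # prints the best transcript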

View File

@@ -477,7 +477,7 @@ class _SttConfig(Coqpit):
         default=False, metadata=dict(help="whether to remove old exported models")
     )
     export_tflite: bool = field(
-        default=False, metadata=dict(help="export a graph ready for TF Lite engine")
+        default=True, metadata=dict(help="export a graph ready for TF Lite engine")
     )
     export_quantize: bool = field(
         default=True,

View File

@@ -90,3 +90,10 @@ def remove_remote(filename):
     """
     # Conditional import
     return gfile.remove(filename)
+
+
+def rmtree_remote(foldername):
+    """
+    Wrapper that can remove local and remote directories like `gs://...`
+    """
+    return gfile.rmtree(foldername)
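The new helper mirrors remove_remote() but for whole directories, which export.py now uses when --remove_export is set. A small usage sketch; the bucket path is hypothetical:

from coqui_stt_training.util.io import isdir_remote, rmtree_remote

export_dir = "gs://my-bucket/old-export"   # hypothetical remote path
if isdir_remote(export_dir):
    rmtree_remote(export_dir)              # works for local paths as well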