Merge pull request #1946 from coqui-ai/training-submodules
Split train.py into separate modules
This commit is contained in:
commit
5afe3c6e59
|
@ -28,15 +28,15 @@ runs:
|
||||||
case "${{ inputs.pyver }}" in
|
case "${{ inputs.pyver }}" in
|
||||||
3.7*)
|
3.7*)
|
||||||
NUMPY_BUILD_VERSION="==1.14.5"
|
NUMPY_BUILD_VERSION="==1.14.5"
|
||||||
NUMPY_DEP_VERSION=">=1.14.5"
|
NUMPY_DEP_VERSION=">=1.14.5,<=1.19.4"
|
||||||
;;
|
;;
|
||||||
3.8*)
|
3.8*)
|
||||||
NUMPY_BUILD_VERSION="==1.17.3"
|
NUMPY_BUILD_VERSION="==1.17.3"
|
||||||
NUMPY_DEP_VERSION=">=1.17.3"
|
NUMPY_DEP_VERSION=">=1.17.3,<=1.19.4"
|
||||||
;;
|
;;
|
||||||
3.9*)
|
3.9*)
|
||||||
NUMPY_BUILD_VERSION="==1.19.4"
|
NUMPY_BUILD_VERSION="==1.19.4"
|
||||||
NUMPY_DEP_VERSION=">=1.19.4"
|
NUMPY_DEP_VERSION=">=1.19.4,<=1.19.4"
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
;;
|
;;
|
||||||
|
@ -57,7 +57,7 @@ runs:
|
||||||
;;
|
;;
|
||||||
3.9*)
|
3.9*)
|
||||||
NUMPY_BUILD_VERSION="==1.19.4"
|
NUMPY_BUILD_VERSION="==1.19.4"
|
||||||
NUMPY_DEP_VERSION=">=1.19.4"
|
NUMPY_DEP_VERSION=">=1.19.4,<=1.19.4"
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
;;
|
;;
|
||||||
|
@ -82,7 +82,7 @@ runs:
|
||||||
;;
|
;;
|
||||||
3.9*)
|
3.9*)
|
||||||
NUMPY_BUILD_VERSION="==1.19.4"
|
NUMPY_BUILD_VERSION="==1.19.4"
|
||||||
NUMPY_DEP_VERSION=">=1.19.4"
|
NUMPY_DEP_VERSION=">=1.19.4,<=1.19.4"
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
;;
|
;;
|
||||||
|
|
|
@ -27,4 +27,5 @@ python -u train.py --alphabet_config_path "data/alphabet.txt" \
|
||||||
--max_to_keep 1 --checkpoint_dir '/tmp/ckpt' \
|
--max_to_keep 1 --checkpoint_dir '/tmp/ckpt' \
|
||||||
--learning_rate 0.001 --dropout_rate 0.05 --export_dir '/tmp/train' \
|
--learning_rate 0.001 --dropout_rate 0.05 --export_dir '/tmp/train' \
|
||||||
--scorer_path 'data/smoke_test/pruned_lm.scorer' \
|
--scorer_path 'data/smoke_test/pruned_lm.scorer' \
|
||||||
--audio_sample_rate ${audio_sample_rate}
|
--audio_sample_rate ${audio_sample_rate} \
|
||||||
|
--export_tflite false
|
||||||
|
|
|
@ -27,4 +27,5 @@ python -u train.py --show_progressbar false --early_stop false \
|
||||||
--learning_rate 0.001 --dropout_rate 0.05 --export_dir '/tmp/train_bytes' \
|
--learning_rate 0.001 --dropout_rate 0.05 --export_dir '/tmp/train_bytes' \
|
||||||
--scorer_path 'data/smoke_test/pruned_lm.bytes.scorer' \
|
--scorer_path 'data/smoke_test/pruned_lm.bytes.scorer' \
|
||||||
--audio_sample_rate ${audio_sample_rate} \
|
--audio_sample_rate ${audio_sample_rate} \
|
||||||
--bytes_output_mode true
|
--bytes_output_mode true \
|
||||||
|
--export_tflite false
|
||||||
|
|
|
@ -2,7 +2,8 @@
|
||||||
import os
|
import os
|
||||||
from import_ldc93s1 import _download_and_preprocess_data as download_ldc
|
from import_ldc93s1 import _download_and_preprocess_data as download_ldc
|
||||||
from coqui_stt_training.util.config import initialize_globals_from_args
|
from coqui_stt_training.util.config import initialize_globals_from_args
|
||||||
from coqui_stt_training.train import train, test
|
from coqui_stt_training.train import train
|
||||||
|
from coqui_stt_training.evaluate import test
|
||||||
|
|
||||||
# only one GPU for only one training sample
|
# only one GPU for only one training sample
|
||||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
||||||
|
@ -21,5 +22,4 @@ initialize_globals_from_args(
|
||||||
)
|
)
|
||||||
|
|
||||||
train()
|
train()
|
||||||
|
|
||||||
test()
|
test()
|
||||||
|
|
1
setup.py
1
setup.py
|
@ -18,6 +18,7 @@ def main():
|
||||||
"coqpit",
|
"coqpit",
|
||||||
"numpy",
|
"numpy",
|
||||||
"optuna",
|
"optuna",
|
||||||
|
"numba <= 0.53.1",
|
||||||
"opuslib == 2.0.0",
|
"opuslib == 2.0.0",
|
||||||
"pandas",
|
"pandas",
|
||||||
"progressbar2",
|
"progressbar2",
|
||||||
|
|
|
@ -0,0 +1,403 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Resolve the TensorFlow C++ log level from the command line and export it
# through the environment. This runs ahead of the TensorFlow import further
# down so the variable is already set when TensorFlow is loaded.
try:
    # Position of the value that follows the ``--log_level`` flag.
    LOG_LEVEL_INDEX = sys.argv.index("--log_level") + 1
except ValueError:
    # Flag not given: 0 marks "no value available".
    LOG_LEVEL_INDEX = 0

if 0 < LOG_LEVEL_INDEX < len(sys.argv):
    DESIRED_LOG_LEVEL = sys.argv[LOG_LEVEL_INDEX]
else:
    # Flag absent, or given as the last argument without a value.
    DESIRED_LOG_LEVEL = "3"

os.environ["TF_CPP_MIN_LOG_LEVEL"] = DESIRED_LOG_LEVEL
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import tensorflow as tf
|
||||||
|
import tensorflow.compat.v1 as tfv1
|
||||||
|
|
||||||
|
# Translate the numeric log level string into the matching TensorFlow Python
# logging verbosity. Unknown strings map to None, exactly as a failed
# ``dict.get`` lookup does.
_VERBOSITY_BY_LEVEL = {
    "0": tfv1.logging.DEBUG,
    "1": tfv1.logging.INFO,
    "2": tfv1.logging.WARN,
    "3": tfv1.logging.ERROR,
}
tfv1.logging.set_verbosity(_VERBOSITY_BY_LEVEL.get(DESIRED_LOG_LEVEL))
|
||||||
|
|
||||||
|
from .util.config import Config
|
||||||
|
from .util.feeding import audio_to_features
|
||||||
|
|
||||||
|
|
||||||
|
def variable_on_cpu(name, shape, initializer):
    r"""
    Utility used during graph creation to create a variable in CPU memory.

    The variable is requested through ``tfv1.get_variable`` while the device
    scope ``Config.cpu_device`` is active.

    :param name: name of the variable inside the current variable scope.
    :param shape: shape of the variable.
    :param initializer: initializer handed to ``tfv1.get_variable``.
    :return: whatever ``tfv1.get_variable`` returns for these arguments.
    """
    # Pin the variable to the configured CPU device.
    with tf.device(Config.cpu_device):
        return tfv1.get_variable(name=name, shape=shape, initializer=initializer)
|
||||||
|
|
||||||
|
|
||||||
|
def create_overlapping_windows(batch_x):
    """
    Expand each time step of ``batch_x`` into a window of its neighbours.

    A 1-D convolution with an identity-matrix kernel copies patches of the
    input unchanged, which yields overlapping windows of
    ``2 * Config.n_context + 1`` steps over ``Config.n_input`` channels.

    :param batch_x: feature tensor whose first dimension is the batch.
    :return: tensor reshaped to ``[batch_size, -1, window_width, n_input]``.
    """
    n_batch = tf.shape(input=batch_x)[0]
    n_channels = Config.n_input
    width = 2 * Config.n_context + 1
    flat_width = width * n_channels

    # Identity kernel: every output channel picks exactly one
    # (window position, input channel) pair, so the convolution only copies.
    identity_kernel = np.eye(flat_width).reshape(width, n_channels, flat_width)
    eye_filter = tf.constant(identity_kernel, tf.float32)

    windows = tf.nn.conv1d(
        input=batch_x, filters=eye_filter, stride=1, padding="SAME"
    )

    # Unflatten the last dimension into [window_width, n_input].
    return tf.reshape(windows, [n_batch, -1, width, n_channels])
|
||||||
|
|
||||||
|
|
||||||
|
def dense(name, x, units, dropout_rate=None, relu=True, layer_norm=False):
    """
    Build one fully connected layer: ``matmul(x, weights) + bias``, optionally
    followed by a clipped ReLU, layer normalization and dropout, in that order.

    :param name: variable scope under which ``bias`` and ``weights`` live.
    :param x: input tensor; its last dimension sizes the weight matrix.
    :param units: width of the layer output.
    :param dropout_rate: rate given to ``tf.nn.dropout``; ``None`` skips dropout.
    :param relu: when true, apply ReLU capped at ``Config.relu_clip``.
    :param layer_norm: when true, apply ``tf.contrib.layers.layer_norm``.
    :return: the output tensor of the layer.
    """
    with tfv1.variable_scope(name):
        bias = variable_on_cpu("bias", [units], tfv1.zeros_initializer())
        weight_init = tfv1.keras.initializers.VarianceScaling(
            scale=1.0, mode="fan_avg", distribution="uniform"
        )
        weights = variable_on_cpu("weights", [x.shape[-1], units], weight_init)

    activation = tf.nn.bias_add(tf.matmul(x, weights), bias)

    if relu:
        # Clipped ReLU: min(max(v, 0), relu_clip).
        activation = tf.minimum(tf.nn.relu(activation), Config.relu_clip)

    if layer_norm:
        # The scope is re-entered on purpose so the normalization variables
        # keep the same names as before.
        with tfv1.variable_scope(name):
            activation = tf.contrib.layers.layer_norm(activation)

    if dropout_rate is None:
        return activation

    return tf.nn.dropout(activation, rate=dropout_rate)
|
||||||
|
|
||||||
|
|
||||||
|
def rnn_impl_lstmblockfusedcell(x, seq_length, previous_state, reuse):
    """
    Run ``x`` through a ``tf.contrib.rnn.LSTMBlockFusedCell``.

    The cell is created under the variable scope
    ``cudnn_lstm/rnn/multi_rnn_cell/cell_0`` with the name
    ``cudnn_compatible_lstm_cell``.

    :param x: input tensor for the LSTM.
    :param seq_length: passed to the cell as ``sequence_length``.
    :param previous_state: passed to the cell as ``initial_state``.
    :param reuse: passed to the cell constructor as ``reuse``.
    :return: ``(output, output_state)`` as produced by the cell.
    """
    with tfv1.variable_scope("cudnn_lstm/rnn/multi_rnn_cell/cell_0"):
        lstm = tf.contrib.rnn.LSTMBlockFusedCell(
            Config.n_cell_dim,
            forget_bias=0,
            reuse=reuse,
            name="cudnn_compatible_lstm_cell",
        )
        rnn_out, rnn_state = lstm(
            inputs=x,
            dtype=tf.float32,
            sequence_length=seq_length,
            initial_state=previous_state,
        )

    return rnn_out, rnn_state
|
||||||
|
|
||||||
|
|
||||||
|
def rnn_impl_cudnn_rnn(x, seq_length, previous_state, _):
    """
    Run ``x`` through a single-layer, unidirectional ``CudnnLSTM``.

    :param x: input tensor for the LSTM.
    :param seq_length: passed to the cell as ``sequence_lengths``.
    :param previous_state: must be ``None``; this backend takes no state.
    :param _: unused; keeps the signature parallel to the other ``rnn_impl_*``
        functions.
    :return: ``(output, output_state)`` as produced by the cell.
    :raises AssertionError: if ``previous_state`` is not ``None``.
    """
    # The explanation used to live in a trailing comment, so a failing
    # assertion carried no message at all.
    assert (
        previous_state is None
    ), "Passing previous state not supported with CuDNN backend"

    # Hack: CudnnLSTM works similarly to Keras layers in that when you instantiate
    # the object it creates the variables, and then you just call it several times
    # to enable variable re-use. Because all of our code is structured in an old
    # school TensorFlow structure where you can just call tf.get_variable again with
    # reuse=True to reuse variables, we can't easily make use of the object oriented
    # way CudnnLSTM is implemented, so we save a singleton instance in the function,
    # emulating a static function variable.
    if rnn_impl_cudnn_rnn.cell is None:
        # Forward direction cell:
        rnn_impl_cudnn_rnn.cell = tf.contrib.cudnn_rnn.CudnnLSTM(
            num_layers=1,
            num_units=Config.n_cell_dim,
            input_mode="linear_input",
            direction="unidirectional",
            dtype=tf.float32,
        )

    output, output_state = rnn_impl_cudnn_rnn.cell(
        inputs=x, sequence_lengths=seq_length
    )

    return output, output_state


# Singleton slot for the CudnnLSTM instance; filled on the first call.
rnn_impl_cudnn_rnn.cell = None
|
||||||
|
|
||||||
|
|
||||||
|
def rnn_impl_static_rnn(x, seq_length, previous_state, reuse):
    """
    Run ``x`` through an ``LSTMCell`` unrolled with ``tfv1.nn.static_rnn``.

    ``x`` is split along its first dimension into one tensor per step, so that
    dimension must be statically known. The per-step outputs are concatenated
    along axis 0 before being returned.

    :param x: input tensor; its first dimension is iterated over.
    :param seq_length: passed to ``static_rnn`` as ``sequence_length``.
    :param previous_state: passed to ``static_rnn`` as ``initial_state``.
    :param reuse: passed to the cell constructor as ``reuse``.
    :return: ``(output, output_state)``.
    """
    with tfv1.variable_scope("cudnn_lstm/rnn/multi_rnn_cell"):
        lstm = tfv1.nn.rnn_cell.LSTMCell(
            Config.n_cell_dim,
            forget_bias=0,
            reuse=reuse,
            name="cudnn_compatible_lstm_cell",
        )

        # static_rnn wants a list with one rank N-1 tensor per step.
        steps = [x[step] for step in range(x.shape[0])]

        step_outputs, final_state = tfv1.nn.static_rnn(
            cell=lstm,
            inputs=steps,
            sequence_length=seq_length,
            initial_state=previous_state,
            dtype=tf.float32,
            scope="cell_0",
        )

        joined = tf.concat(step_outputs, 0)

    return joined, final_state
|
||||||
|
|
||||||
|
|
||||||
|
def create_model(
    batch_x,
    seq_length,
    dropout,
    reuse=False,
    batch_size=None,
    previous_state=None,
    overlap=True,
    rnn_impl=rnn_impl_lstmblockfusedcell,
):
    """
    Assemble the acoustic model graph: three dense layers, one recurrent
    layer, one more dense layer and a final linear projection to the logits.

    :param batch_x: batch-major input features. After the optional windowing
        step it is transposed with ``perm=[1, 0, 2, 3]``, so it is rank 4 there.
    :param seq_length: forwarded to ``rnn_impl``.
    :param dropout: indexable of dropout rates; entries 0, 1, 2 and 5 are used.
    :param reuse: forwarded to ``rnn_impl``.
    :param batch_size: static batch size; when falsy it is read from the
        runtime shape of ``batch_x``.
    :param previous_state: forwarded to ``rnn_impl``.
    :param overlap: when true, ``create_overlapping_windows`` is applied first.
    :param rnn_impl: callable ``(x, seq_length, previous_state, reuse)`` that
        returns ``(output, output_state)``.
    :return: ``(raw_logits, layers)``. ``raw_logits`` is time-major with shape
        ``[n_steps, batch_size, n_hidden_6]``; ``layers`` maps layer names to
        the intermediate tensors.
    """
    layers = {}

    if not batch_size:
        batch_size = tf.shape(input=batch_x)[0]

    # Create overlapping feature windows if needed.
    if overlap:
        batch_x = create_overlapping_windows(batch_x)

    # Swap the batch and time axes, then flatten to rank 2
    # (n_steps*batch_size, n_input + 2*n_input*n_context) for the first layer.
    window_features = Config.n_input + 2 * Config.n_input * Config.n_context
    time_major = tf.transpose(a=batch_x, perm=[1, 0, 2, 3])
    flat_input = tf.reshape(time_major, [-1, window_features])
    layers["input_reshaped"] = flat_input

    # Three hidden layers with clipped ReLU activation and dropout.
    hidden = flat_input
    hidden_units = (Config.n_hidden_1, Config.n_hidden_2, Config.n_hidden_3)
    for index, units in enumerate(hidden_units):
        layer_name = "layer_{}".format(index + 1)
        hidden = dense(
            layer_name,
            hidden,
            units,
            dropout_rate=dropout[index],
            layer_norm=Config.layer_norm,
        )
        layers[layer_name] = hidden

    # The recurrent layer takes [max_time, batch_size, input_size]; note that
    # layers["layer_3"] keeps the tensor from before this reshape.
    rnn_input = tf.reshape(hidden, [-1, batch_size, Config.n_hidden_3])

    # The RNN implementation is a parameter because different ones are used
    # for training and inference.
    rnn_out, rnn_state = rnn_impl(rnn_input, seq_length, previous_state, reuse)

    # [n_steps, batch_size, n_cell_dim] -> [n_steps*batch_size, n_cell_dim]
    rnn_flat = tf.reshape(rnn_out, [-1, Config.n_cell_dim])
    layers["rnn_output"] = rnn_flat
    layers["rnn_output_state"] = rnn_state

    # Fifth hidden layer with clipped ReLU activation (dropout index 5).
    layer_5 = dense(
        "layer_5",
        rnn_flat,
        Config.n_hidden_5,
        dropout_rate=dropout[5],
        layer_norm=Config.layer_norm,
    )
    layers["layer_5"] = layer_5

    # Final linear layer producing the logits.
    layer_6 = dense("layer_6", layer_5, Config.n_hidden_6, relu=False)
    layers["layer_6"] = layer_6

    # Back to time-major [n_steps, batch_size, n_hidden_6]; unlike the input,
    # the output is time-major.
    raw_logits = tf.reshape(
        layer_6, [-1, batch_size, Config.n_hidden_6], name="raw_logits"
    )
    layers["raw_logits"] = raw_logits

    return raw_logits, layers
|
||||||
|
|
||||||
|
|
||||||
|
def create_inference_graph(batch_size=1, n_steps=16, tflite=False):
    """
    Build the inference graph: feature computation, input placeholders, the
    model itself and a softmax over its logits.

    :param batch_size: static batch size. A non-positive value (or ``None``)
        requests a dynamic batch dimension, which turns off state handling.
    :param n_steps: static number of time steps; a non-positive value leaves
        that dimension dynamic.
    :param tflite: build the TFLite flavour (static RNN, squeezed logits).
    :return: ``(inputs, outputs, layers)``. ``inputs`` and ``outputs`` map names
        to tensors; ``layers`` is the dict returned by ``create_model``.
    :raises NotImplementedError: if a dynamic batch size is combined with
        ``tflite`` or with a static ``n_steps``.
    """
    if batch_size is None or batch_size <= 0:
        batch_size = None
    # Once batch_size may be None it cannot be ordered against an int on
    # Python 3, so the dynamic case is tracked with an explicit flag.
    dynamic_batch = batch_size is None

    # Reject unsupported combinations before any graph node is created.
    if dynamic_batch:
        if tflite:
            raise NotImplementedError(
                "dynamic batch_size does not support tflite nor streaming"
            )
        if n_steps > 0:
            raise NotImplementedError(
                "dynamic batch_size expect n_steps to be dynamic too"
            )

    # Create feature computation graph

    # native_client: this node's name and shape are part of the API boundary
    # with the native client, if you change them you should sync changes with
    # the C++ code.
    input_samples = tfv1.placeholder(
        tf.float32, [Config.audio_window_samples], "input_samples"
    )
    samples = tf.expand_dims(input_samples, -1)
    mfccs, _ = audio_to_features(samples, Config.audio_sample_rate)
    # native_client: this node's name and shape are part of the API boundary
    # with the native client, if you change them you should sync changes with
    # the C++ code.
    mfccs = tf.identity(mfccs, name="mfccs")

    # Input tensor will be of shape [batch_size, n_steps, 2*n_context+1, n_input]
    # This shape is read by the native_client in STT_CreateModel to know the
    # value of n_steps, n_context and n_input. Make sure you update the code
    # there if this shape is changed.
    #
    # native_client: this node's name and shape are part of the API boundary
    # with the native client, if you change them you should sync changes with
    # the C++ code.
    input_tensor = tfv1.placeholder(
        tf.float32,
        [
            batch_size,
            n_steps if n_steps > 0 else None,
            2 * Config.n_context + 1,
            Config.n_input,
        ],
        name="input_node",
    )
    # native_client: this node's name and shape are part of the API boundary
    # with the native client, if you change them you should sync changes with
    # the C++ code.
    seq_length = tfv1.placeholder(tf.int32, [batch_size], name="input_lengths")

    if dynamic_batch:
        # no state management since n_step is expected to be dynamic too
        previous_state = None
    else:
        # native_client: this node's name and shape are part of the API boundary
        # with the native client, if you change them you should sync changes with
        # the C++ code.
        previous_state_c = tfv1.placeholder(
            tf.float32, [batch_size, Config.n_cell_dim], name="previous_state_c"
        )
        # native_client: this node's name and shape are part of the API boundary
        # with the native client, if you change them you should sync changes with
        # the C++ code.
        previous_state_h = tfv1.placeholder(
            tf.float32, [batch_size, Config.n_cell_dim], name="previous_state_h"
        )

        previous_state = tf.nn.rnn_cell.LSTMStateTuple(
            previous_state_c, previous_state_h
        )

    # One rate per layer
    no_dropout = [None] * 6

    rnn_impl = rnn_impl_static_rnn if tflite else rnn_impl_lstmblockfusedcell

    # NOTE(review): the sequence length is gated on Config.export_tflite, not
    # on the `tflite` argument. Kept as is; confirm this is intended.
    logits, layers = create_model(
        batch_x=input_tensor,
        batch_size=batch_size,
        seq_length=seq_length if not Config.export_tflite else None,
        dropout=no_dropout,
        previous_state=previous_state,
        overlap=False,
        rnn_impl=rnn_impl,
    )

    # TF Lite runtime will check that input dimensions are 1, 2 or 4
    # by default we get 3, the middle one being batch_size which is forced to
    # one on inference graph, so remove that dimension
    #
    # native_client: this node's name and shape are part of the API boundary
    # with the native client, if you change them you should sync changes with
    # the C++ code.
    if tflite:
        logits = tf.squeeze(logits, [1])

    # Apply softmax for CTC decoder
    probs = tf.nn.softmax(logits, name="logits")

    if dynamic_batch:
        return (
            {
                "input": input_tensor,
                "input_lengths": seq_length,
            },
            {
                "outputs": probs,
            },
            layers,
        )

    new_state_c, new_state_h = layers["rnn_output_state"]
    new_state_c = tf.identity(new_state_c, name="new_state_c")
    new_state_h = tf.identity(new_state_h, name="new_state_h")

    inputs = {
        "input": input_tensor,
        "previous_state_c": previous_state_c,
        "previous_state_h": previous_state_h,
        "input_samples": input_samples,
    }

    if not Config.export_tflite:
        inputs["input_lengths"] = seq_length

    outputs = {
        "outputs": probs,
        "new_state_c": new_state_c,
        "new_state_h": new_state_h,
        "mfccs": mfccs,
        # Expose internal layers for downstream applications
        "layer_3": layers["layer_3"],
        "layer_5": layers["layer_5"],
    }

    return inputs, outputs, layers
|
|
@ -13,6 +13,7 @@ from six.moves import zip
|
||||||
|
|
||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
|
|
||||||
|
from .deepspeech_model import create_model
|
||||||
from .util.augmentations import NormalizeSampleRate
|
from .util.augmentations import NormalizeSampleRate
|
||||||
from .util.checkpoints import load_graph_for_evaluation
|
from .util.checkpoints import load_graph_for_evaluation
|
||||||
from .util.config import (
|
from .util.config import (
|
||||||
|
@ -168,25 +169,25 @@ def evaluate(test_csvs, create_model):
|
||||||
return samples
|
return samples
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def test():
|
||||||
initialize_globals_from_cli()
|
tfv1.reset_default_graph()
|
||||||
|
|
||||||
if not Config.test_files:
|
|
||||||
log_error(
|
|
||||||
"You need to specify what files to use for evaluation via "
|
|
||||||
"the --test_files flag."
|
|
||||||
)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
from .train import ( # pylint: disable=cyclic-import,import-outside-toplevel
|
|
||||||
create_model,
|
|
||||||
)
|
|
||||||
|
|
||||||
samples = evaluate(Config.test_files, create_model)
|
samples = evaluate(Config.test_files, create_model)
|
||||||
|
|
||||||
if Config.test_output_file:
|
if Config.test_output_file:
|
||||||
save_samples_json(samples, Config.test_output_file)
|
save_samples_json(samples, Config.test_output_file)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
initialize_globals_from_cli()
|
||||||
|
|
||||||
|
if not Config.test_files:
|
||||||
|
raise RuntimeError(
|
||||||
|
"You need to specify what files to use for evaluation via "
|
||||||
|
"the --test_files flag."
|
||||||
|
)
|
||||||
|
|
||||||
|
test()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|
|
@ -0,0 +1,216 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Pick the TensorFlow C++ log level from ``--log_level <value>`` on the command
# line, falling back to "3", and publish it through the environment. This runs
# ahead of the TensorFlow import further down.
if "--log_level" in sys.argv:
    # Index of the argument right after the flag.
    LOG_LEVEL_INDEX = sys.argv.index("--log_level") + 1
else:
    LOG_LEVEL_INDEX = 0

# A usable value exists only if the flag was present and was not the last argument.
_has_log_level_value = 0 < LOG_LEVEL_INDEX < len(sys.argv)
DESIRED_LOG_LEVEL = sys.argv[LOG_LEVEL_INDEX] if _has_log_level_value else "3"

os.environ["TF_CPP_MIN_LOG_LEVEL"] = DESIRED_LOG_LEVEL
|
||||||
|
|
||||||
|
import tensorflow as tf
|
||||||
|
import tensorflow.compat.v1 as tfv1
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
from .deepspeech_model import create_inference_graph
|
||||||
|
from .util.checkpoints import load_graph_for_evaluation
|
||||||
|
from .util.config import Config, initialize_globals_from_cli, log_error, log_info
|
||||||
|
from .util.io import (
|
||||||
|
open_remote,
|
||||||
|
rmtree_remote,
|
||||||
|
listdir_remote,
|
||||||
|
is_remote_path,
|
||||||
|
isdir_remote,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def file_relative_read(fname):
    """
    Return the full text of ``fname``, resolved relative to this module's directory.

    :param fname: file name or relative path below the directory of this file.
    :return: the file content as a string.
    """
    path = os.path.join(os.path.dirname(__file__), fname)
    # Close the handle deterministically rather than leaving it to the
    # garbage collector.
    with open(path) as fin:
        return fin.read()
|
||||||
|
|
||||||
|
|
||||||
|
def export():
    r"""
    Restores the trained variables into a simpler graph that will be exported for serving.

    Steps: build the inference graph, attach the metadata constants, freeze
    the variables from the checkpoint into constants, write the model as
    ``.pb`` or ``.tflite`` into ``Config.export_dir``, then write a Markdown
    metadata file next to it.
    """
    log_info("Exporting the model...")

    tfv1.reset_default_graph()

    inputs, outputs, _ = create_inference_graph(
        batch_size=Config.export_batch_size,
        n_steps=Config.n_steps,
        tflite=Config.export_tflite,
    )

    graph_version = int(file_relative_read("GRAPH_VERSION").strip())
    assert graph_version > 0

    # native_client: these nodes's names and shapes are part of the API boundary
    # with the native client, if you change them you should sync changes with
    # the C++ code.
    metadata_constants = (
        ("metadata_version", graph_version),
        ("metadata_sample_rate", Config.audio_sample_rate),
        ("metadata_feature_win_len", Config.feature_win_len),
        ("metadata_feature_win_step", Config.feature_win_step),
        ("metadata_beam_width", Config.export_beam_width),
        ("metadata_alphabet", Config.alphabet.Serialize()),
    )
    for node_name, value in metadata_constants:
        outputs[node_name] = tf.constant([value], name=node_name)

    if Config.export_language:
        outputs["metadata_language"] = tf.constant(
            [Config.export_language.encode("utf-8")], name="metadata_language"
        )

    # Prevent further graph changes
    tfv1.get_default_graph().finalize()

    output_names_tensors = [
        tensor.op.name for tensor in outputs.values() if isinstance(tensor, tf.Tensor)
    ]
    output_names_ops = [
        op.name for op in outputs.values() if isinstance(op, tf.Operation)
    ]
    output_names = output_names_tensors + output_names_ops

    with tf.Session() as session:
        # Restore variables from checkpoint
        load_graph_for_evaluation(session)

        if Config.remove_export and isdir_remote(Config.export_dir):
            log_info("Removing old export")
            rmtree_remote(Config.export_dir)

        if not is_remote_path(Config.export_dir):
            # exist_ok avoids the check-then-create race of a separate isdir test.
            os.makedirs(Config.export_dir, exist_ok=True)

        frozen_graph = tfv1.graph_util.convert_variables_to_constants(
            sess=session,
            input_graph_def=tfv1.get_default_graph().as_graph_def(),
            output_node_names=output_names,
        )

        frozen_graph = tfv1.graph_util.extract_sub_graph(
            graph_def=frozen_graph, dest_nodes=output_names
        )

        if not Config.export_tflite:
            output_graph_path = os.path.join(
                Config.export_dir, Config.export_file_name + ".pb"
            )
            with open_remote(output_graph_path, "wb") as fout:
                fout.write(frozen_graph.SerializeToString())
        else:
            # Build the name from the base file name. Replacing ".pb" inside
            # the ".pb" name would also rewrite a ".pb" that is part of the
            # user-chosen name itself.
            output_tflite_path = os.path.join(
                Config.export_dir, Config.export_file_name + ".tflite"
            )

            converter = tf.lite.TFLiteConverter(
                frozen_graph,
                input_tensors=inputs.values(),
                output_tensors=outputs.values(),
            )

            if Config.export_quantize:
                converter.optimizations = [tf.lite.Optimize.DEFAULT]

            # AudioSpectrogram and Mfcc ops are custom but have built-in kernels in TFLite
            converter.allow_custom_ops = True
            tflite_model = converter.convert()

            with open_remote(output_tflite_path, "wb") as fout:
                fout.write(tflite_model)

        log_info("Models exported at %s" % (Config.export_dir))

    metadata_fname = os.path.join(
        Config.export_dir,
        "{}_{}_{}.md".format(
            Config.export_author_id,
            Config.export_model_name,
            Config.export_model_version,
        ),
    )

    model_runtime = "tflite" if Config.export_tflite else "tensorflow"
    # Front-matter entries in the order they are written.
    front_matter = (
        ("author", Config.export_author_id),
        ("model_name", Config.export_model_name),
        ("model_version", Config.export_model_version),
        ("contact_info", Config.export_contact_info),
        ("license", Config.export_license),
        ("language", Config.export_language),
        ("runtime", model_runtime),
        ("min_stt_version", Config.export_min_stt_version),
        ("max_stt_version", Config.export_max_stt_version),
        (
            "acoustic_model_url",
            "<replace this with a publicly available URL of the acoustic model>",
        ),
        (
            "scorer_url",
            "<replace this with a publicly available URL of the scorer, if present>",
        ),
    )
    with open_remote(metadata_fname, "w") as f:
        f.write("---\n")
        for key, value in front_matter:
            f.write("{}: {}\n".format(key, value))
        f.write("---\n")
        f.write("{}\n".format(Config.export_description))

    log_info(
        "Model metadata file saved to {}. Before submitting the exported model for publishing make sure all information in the metadata file is correct, and complete the URL fields.".format(
            metadata_fname
        )
    )
|
||||||
|
|
||||||
|
|
||||||
|
def package_zip():
    """
    Copy the scorer into the export directory and zip that directory.

    ``--export_dir path/to/export/LANG_CODE/`` yields
    ``path/to/export/LANG_CODE.zip``. Remote export directories are not
    packaged; an error is logged and the function returns.
    """
    # Test the path as configured. os.path.abspath always yields a local
    # absolute path (and normalizes "//"), so a check on its result could
    # never flag a remote location.
    # NOTE(review): assumes is_remote_path inspects the path prefix - confirm.
    if is_remote_path(Config.export_dir):
        log_error(
            "Cannot package remote path zip %s. Please do this manually."
            % Config.export_dir
        )
        return

    export_dir = os.path.join(
        os.path.abspath(Config.export_dir), ""
    )  # Force ending '/'

    # With the trailing separator, dirname() is the directory path itself,
    # which make_archive uses as the archive base name.
    zip_filename = os.path.dirname(export_dir)

    shutil.copy(Config.scorer_path, export_dir)

    archive = shutil.make_archive(zip_filename, "zip", export_dir)
    log_info("Exported packaged model {}".format(archive))
|
||||||
|
|
||||||
|
|
||||||
|
def main(_=None):
    """
    Command-line entry point: export the model, optionally as a zip package.

    :param _: ignored. It now defaults to ``None`` so the bare ``main()`` call
        in the ``__main__`` guard works, while callers that pass one
        positional argument keep working.
    :raises RuntimeError: if ``--export_dir`` is missing, or if zip packaging
        was requested and the export directory is not empty.
    """
    initialize_globals_from_cli()

    if not Config.export_dir:
        raise RuntimeError(
            "Calling export script directly but no --export_dir specified"
        )

    # Packaging zips the whole directory, so it must start out empty.
    if Config.export_zip and listdir_remote(Config.export_dir):
        raise RuntimeError(
            "Directory {} is not empty, please fix this.".format(Config.export_dir)
        )

    # Export to folder
    export()

    if Config.export_zip:
        package_zip()


if __name__ == "__main__":
    main()
|
|
@ -14,12 +14,13 @@ os.environ["TF_CPP_MIN_LOG_LEVEL"] = DESIRED_LOG_LEVEL
|
||||||
import json
|
import json
|
||||||
import shutil
|
import shutil
|
||||||
import time
|
import time
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import progressbar
|
import progressbar
|
||||||
import tensorflow.compat.v1 as tfv1
|
import tensorflow.compat.v1 as tfv1
|
||||||
|
|
||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
|
from coqui_stt_ctcdecoder import Scorer
|
||||||
|
|
||||||
tfv1.logging.set_verbosity(
|
tfv1.logging.set_verbosity(
|
||||||
{
|
{
|
||||||
|
@ -30,12 +31,15 @@ tfv1.logging.set_verbosity(
|
||||||
}.get(DESIRED_LOG_LEVEL)
|
}.get(DESIRED_LOG_LEVEL)
|
||||||
)
|
)
|
||||||
|
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
from coqui_stt_ctcdecoder import Scorer, ctc_beam_search_decoder
|
from . import evaluate
|
||||||
from six.moves import range, zip
|
from . import export
|
||||||
|
from . import training_graph_inference
|
||||||
from .evaluate import evaluate
|
from .deepspeech_model import (
|
||||||
|
create_model,
|
||||||
|
rnn_impl_lstmblockfusedcell,
|
||||||
|
rnn_impl_cudnn_rnn,
|
||||||
|
)
|
||||||
from .util.augmentations import NormalizeSampleRate
|
from .util.augmentations import NormalizeSampleRate
|
||||||
from .util.checkpoints import (
|
from .util.checkpoints import (
|
||||||
load_graph_for_evaluation,
|
load_graph_for_evaluation,
|
||||||
|
@ -52,260 +56,16 @@ from .util.config import (
|
||||||
log_progress,
|
log_progress,
|
||||||
log_warn,
|
log_warn,
|
||||||
)
|
)
|
||||||
from .util.evaluate_tools import save_samples_json
|
from .util.feeding import create_dataset
|
||||||
from .util.feeding import audio_to_features, audiofile_to_features, create_dataset
|
|
||||||
from .util.helpers import ExceptionBox, check_ctcdecoder_version
|
from .util.helpers import ExceptionBox, check_ctcdecoder_version
|
||||||
from .util.io import (
|
from .util.io import (
|
||||||
is_remote_path,
|
is_remote_path,
|
||||||
isdir_remote,
|
|
||||||
listdir_remote,
|
|
||||||
open_remote,
|
open_remote,
|
||||||
remove_remote,
|
remove_remote,
|
||||||
)
|
)
|
||||||
|
|
||||||
check_ctcdecoder_version()
|
check_ctcdecoder_version()
|
||||||
|
|
||||||
# Graph Creation
|
|
||||||
# ==============
|
|
||||||
|
|
||||||
|
|
||||||
def variable_on_cpu(name, shape, initializer):
    r"""
    Utility used during graph creation: create (or fetch) a variable while the
    device scope is pinned to ``Config.cpu_device``.

    :param name: variable name handed to ``tfv1.get_variable``
    :param shape: variable shape handed to ``tfv1.get_variable``
    :param initializer: initializer handed to ``tfv1.get_variable``
    :return: whatever ``tfv1.get_variable`` returns for these arguments
    """
    # Everything created inside this scope is placed on the configured CPU device
    with tf.device(Config.cpu_device):
        return tfv1.get_variable(name=name, shape=shape, initializer=initializer)
|
|
||||||
|
|
||||||
|
|
||||||
def create_overlapping_windows(batch_x):
    """
    Expand every time step of ``batch_x`` into a window of
    ``2 * Config.n_context + 1`` neighbouring steps.

    :param batch_x: feature tensor; its first dimension is read as the batch size
    :return: tensor reshaped to [batch_size, n_windows, window_width, n_input]
    """
    n_batch = tf.shape(input=batch_x)[0]
    width = 2 * Config.n_context + 1
    channels = Config.n_input
    flat_width = width * channels

    # An identity matrix reshaped into a conv filter makes the convolution copy
    # patches of the input through unchanged, which yields the overlapping
    # windows over the MFCCs.
    identity_kernel = np.eye(flat_width).reshape(width, channels, flat_width)
    eye_filter = tf.constant(identity_kernel, tf.float32)

    # Slide the identity filter over the time axis
    windows = tf.nn.conv1d(
        input=batch_x, filters=eye_filter, stride=1, padding="SAME"
    )

    # Remove dummy depth dimension and reshape into
    # [batch_size, n_windows, window_width, n_input]
    return tf.reshape(windows, [n_batch, -1, width, channels])
|
|
||||||
|
|
||||||
|
|
||||||
def dense(name, x, units, dropout_rate=None, relu=True, layer_norm=False):
    """
    Fully connected layer with optional clipped ReLU, layer norm and dropout.

    :param name: variable scope holding the ``bias`` and ``weights`` variables
    :param x: input tensor; its last dimension sizes the weight matrix
    :param units: number of output units
    :param dropout_rate: dropout rate, or ``None`` to skip dropout
    :param relu: apply ReLU clipped at ``Config.relu_clip`` when true
    :param layer_norm: apply ``tf.contrib.layers.layer_norm`` when true
    :return: the layer's output tensor
    """
    with tfv1.variable_scope(name):
        bias = variable_on_cpu("bias", [units], tfv1.zeros_initializer())
        weight_init = tfv1.keras.initializers.VarianceScaling(
            scale=1.0, mode="fan_avg", distribution="uniform"
        )
        weights = variable_on_cpu("weights", [x.shape[-1], units], weight_init)

    projected = tf.matmul(x, weights)
    output = tf.nn.bias_add(projected, bias)

    if relu:
        activated = tf.nn.relu(output)
        output = tf.minimum(activated, Config.relu_clip)

    if layer_norm:
        # Layer-norm variables live in the same named scope as the weights
        with tfv1.variable_scope(name):
            output = tf.contrib.layers.layer_norm(output)

    if dropout_rate is None:
        return output

    return tf.nn.dropout(output, rate=dropout_rate)
|
|
||||||
|
|
||||||
|
|
||||||
def rnn_impl_lstmblockfusedcell(x, seq_length, previous_state, reuse):
    """
    Run ``x`` through a ``tf.contrib.rnn.LSTMBlockFusedCell``.

    :param x: input tensor for the recurrent layer
    :param seq_length: passed through as ``sequence_length``
    :param previous_state: passed through as ``initial_state``
    :param reuse: variable reuse flag given to the cell
    :return: tuple ``(output, output_state)`` as produced by the cell
    """
    cell_kwargs = dict(
        forget_bias=0,
        reuse=reuse,
        name="cudnn_compatible_lstm_cell",
    )

    with tfv1.variable_scope("cudnn_lstm/rnn/multi_rnn_cell/cell_0"):
        lstm = tf.contrib.rnn.LSTMBlockFusedCell(Config.n_cell_dim, **cell_kwargs)
        rnn_out, rnn_state = lstm(
            inputs=x,
            dtype=tf.float32,
            sequence_length=seq_length,
            initial_state=previous_state,
        )

    return rnn_out, rnn_state
|
|
||||||
|
|
||||||
|
|
||||||
def rnn_impl_cudnn_rnn(x, seq_length, previous_state, _):
    """
    Run ``x`` through a ``tf.contrib.cudnn_rnn.CudnnLSTM`` layer.

    :param x: input tensor for the recurrent layer
    :param seq_length: passed through as ``sequence_lengths``
    :param previous_state: must be ``None`` (asserted)
    :param _: unused; present so the signature matches the other RNN impls
    :return: tuple ``(output, output_state)`` as produced by the layer
    """
    assert (
        previous_state is None
    )  # 'Passing previous state not supported with CuDNN backend'

    # Hack: CudnnLSTM works similarly to Keras layers in that instantiating the
    # object creates the variables, and calling it several times re-uses them.
    # The rest of this code is structured in the old-school TensorFlow way,
    # where tf.get_variable is called again with reuse=True, so the object
    # oriented CudnnLSTM does not fit in directly. A singleton instance is
    # therefore kept on the function itself, emulating a static function
    # variable.
    if not rnn_impl_cudnn_rnn.cell:
        # Forward direction cell, built once and cached on the function
        rnn_impl_cudnn_rnn.cell = tf.contrib.cudnn_rnn.CudnnLSTM(
            num_layers=1,
            num_units=Config.n_cell_dim,
            input_mode="linear_input",
            direction="unidirectional",
            dtype=tf.float32,
        )

    cached_cell = rnn_impl_cudnn_rnn.cell
    rnn_out, rnn_state = cached_cell(inputs=x, sequence_lengths=seq_length)

    return rnn_out, rnn_state


# Singleton slot for the cached CudnnLSTM instance; filled on first call
rnn_impl_cudnn_rnn.cell = None
|
|
||||||
|
|
||||||
|
|
||||||
def rnn_impl_static_rnn(x, seq_length, previous_state, reuse):
    """
    Run ``x`` through an ``LSTMCell`` unrolled with ``tfv1.nn.static_rnn``.

    :param x: input tensor; its first dimension must be statically known
              because it is iterated with ``range(x.shape[0])``
    :param seq_length: passed through as ``sequence_length``
    :param previous_state: passed through as ``initial_state``
    :param reuse: variable reuse flag given to the cell
    :return: tuple ``(output, output_state)``, with the per-step outputs
             concatenated along axis 0
    """
    with tfv1.variable_scope("cudnn_lstm/rnn/multi_rnn_cell"):
        # Forward direction cell:
        lstm = tfv1.nn.rnn_cell.LSTMCell(
            Config.n_cell_dim,
            forget_bias=0,
            reuse=reuse,
            name="cudnn_compatible_lstm_cell",
        )

        # Split rank N tensor into list of rank N-1 tensors
        n_steps = x.shape[0]
        step_inputs = [x[step] for step in range(n_steps)]

        step_outputs, final_state = tfv1.nn.static_rnn(
            cell=lstm,
            inputs=step_inputs,
            sequence_length=seq_length,
            initial_state=previous_state,
            dtype=tf.float32,
            scope="cell_0",
        )

        merged = tf.concat(step_outputs, 0)

    return merged, final_state
|
|
||||||
|
|
||||||
|
|
||||||
def create_model(
    batch_x,
    seq_length,
    dropout,
    reuse=False,
    batch_size=None,
    previous_state=None,
    overlap=True,
    rnn_impl=rnn_impl_lstmblockfusedcell,
):
    """
    Assemble the acoustic model: three dense layers, one recurrent layer,
    one more dense layer and a final linear projection to the logits.

    :param batch_x: input features
    :param seq_length: sequence lengths forwarded to ``rnn_impl``
    :param dropout: indexable of dropout rates; indices 0, 1, 2 and 5 are read
    :param reuse: variable reuse flag forwarded to ``rnn_impl``
    :param batch_size: batch size; read from ``batch_x`` when falsy
    :param previous_state: initial RNN state forwarded to ``rnn_impl``
    :param overlap: build overlapping feature windows first when true
    :param rnn_impl: callable implementing the recurrent layer
    :return: ``(raw_logits, layers)`` where ``raw_logits`` has shape
             [n_steps, batch_size, n_hidden_6] and ``layers`` maps layer names
             to the intermediate tensors
    """
    layers = {}

    # Input shape: [batch_size, n_steps, n_input + 2*n_input*n_context]
    if not batch_size:
        batch_size = tf.shape(input=batch_x)[0]

    # Create overlapping feature windows if needed
    if overlap:
        batch_x = create_overlapping_windows(batch_x)

    # The first layer expects a rank-2 tensor, so the batch is made time-major
    # and then flattened to
    # [n_steps*batch_size, n_input + 2*n_input*n_context].
    time_major = tf.transpose(a=batch_x, perm=[1, 0, 2, 3])
    window_features = Config.n_input + 2 * Config.n_input * Config.n_context
    hidden = tf.reshape(time_major, [-1, window_features])
    layers["input_reshaped"] = hidden

    # Three hidden layers with clipped RELU activation and dropout.
    front_layers = (
        ("layer_1", Config.n_hidden_1, 0),
        ("layer_2", Config.n_hidden_2, 1),
        ("layer_3", Config.n_hidden_3, 2),
    )
    for layer_name, width, rate_index in front_layers:
        hidden = dense(
            layer_name,
            hidden,
            width,
            dropout_rate=dropout[rate_index],
            layer_norm=Config.layer_norm,
        )
        layers[layer_name] = hidden

    # The LSTM RNN expects its input to be of shape
    # [max_time, batch_size, input_size], so layer_3 is reshaped accordingly.
    rnn_input = tf.reshape(hidden, [-1, batch_size, Config.n_hidden_3])

    # Run through the parametrized RNN implementation, as different RNNs are
    # used for training and inference
    rnn_out, rnn_state = rnn_impl(rnn_input, seq_length, previous_state, reuse)

    # Reshape output from [n_steps, batch_size, n_cell_dim]
    # to [n_steps*batch_size, n_cell_dim]
    rnn_out = tf.reshape(rnn_out, [-1, Config.n_cell_dim])
    layers["rnn_output"] = rnn_out
    layers["rnn_output_state"] = rnn_state

    # Fifth hidden layer with clipped RELU activation
    layer_5 = dense(
        "layer_5",
        rnn_out,
        Config.n_hidden_5,
        dropout_rate=dropout[5],
        layer_norm=Config.layer_norm,
    )
    layers["layer_5"] = layer_5

    # Final linear layer creating the `n_classes` dimensional logits
    layer_6 = dense("layer_6", layer_5, Config.n_hidden_6, relu=False)
    layers["layer_6"] = layer_6

    # Reshape from [n_steps*batch_size, n_hidden_6] to the more useful
    # [n_steps, batch_size, n_hidden_6]. Unlike the input, this is time-major.
    raw_logits = tf.reshape(
        layer_6, [-1, batch_size, Config.n_hidden_6], name="raw_logits"
    )
    layers["raw_logits"] = raw_logits

    # Output shape: [n_steps, batch_size, n_hidden_6]
    return raw_logits, layers
|
|
||||||
|
|
||||||
|
|
||||||
# Accuracy and Loss
|
# Accuracy and Loss
|
||||||
# =================
|
# =================
|
||||||
|
|
||||||
|
@ -900,371 +660,6 @@ def train():
|
||||||
log_debug("Session closed.")
|
log_debug("Session closed.")
|
||||||
|
|
||||||
|
|
||||||
def test():
    """
    Reset the default graph, evaluate ``Config.test_files`` and, when
    ``Config.test_output_file`` is set, save the resulting samples as JSON.
    """
    tfv1.reset_default_graph()

    results = evaluate(Config.test_files, create_model)

    output_path = Config.test_output_file
    if not output_path:
        return

    save_samples_json(results, output_path)
|
|
||||||
|
|
||||||
|
|
||||||
def create_inference_graph(batch_size=1, n_steps=16, tflite=False):
    """
    Build the placeholders and model graph used for inference and export.

    :param batch_size: fixed batch size; a value <= 0 requests a dynamic batch
                       dimension
    :param n_steps: fixed number of time steps; a value <= 0 leaves the time
                    dimension dynamic
    :param tflite: build the TF Lite flavour of the graph (static RNN,
                   squeezed batch dimension, no ``input_lengths`` input)
    :return: ``(inputs, outputs, layers)``: two dicts of named tensors plus
             the ``layers`` dict returned by ``create_model``
    :raises NotImplementedError: when a dynamic batch size is combined with
                                 ``tflite`` or with a fixed ``n_steps``
    """
    # Non-positive batch sizes mean "dynamic", represented as None in shapes.
    batch_size = batch_size if batch_size > 0 else None
    # After the normalisation above batch_size is never <= 0, and comparing
    # None with an int raises TypeError on Python 3, so test for None instead.
    dynamic_batch = batch_size is None

    if dynamic_batch:
        # Reject unsupported combinations before any graph nodes are created
        if tflite:
            raise NotImplementedError(
                "dynamic batch_size does not support tflite nor streaming"
            )
        if n_steps > 0:
            raise NotImplementedError(
                "dynamic batch_size expect n_steps to be dynamic too"
            )

    # Create feature computation graph

    # native_client: this node's name and shape are part of the API boundary
    # with the native client, if you change them you should sync changes with
    # the C++ code.
    input_samples = tfv1.placeholder(
        tf.float32, [Config.audio_window_samples], "input_samples"
    )
    samples = tf.expand_dims(input_samples, -1)
    mfccs, _ = audio_to_features(samples, Config.audio_sample_rate)
    # native_client: this node's name and shape are part of the API boundary
    # with the native client, if you change them you should sync changes with
    # the C++ code.
    mfccs = tf.identity(mfccs, name="mfccs")

    # Input tensor will be of shape [batch_size, n_steps, 2*n_context+1, n_input]
    # This shape is read by the native_client in STT_CreateModel to know the
    # value of n_steps, n_context and n_input. Make sure you update the code
    # there if this shape is changed.
    #
    # native_client: this node's name and shape are part of the API boundary
    # with the native client, if you change them you should sync changes with
    # the C++ code.
    input_tensor = tfv1.placeholder(
        tf.float32,
        [
            batch_size,
            n_steps if n_steps > 0 else None,
            2 * Config.n_context + 1,
            Config.n_input,
        ],
        name="input_node",
    )
    # native_client: this node's name and shape are part of the API boundary
    # with the native client, if you change them you should sync changes with
    # the C++ code.
    seq_length = tfv1.placeholder(tf.int32, [batch_size], name="input_lengths")

    if dynamic_batch:
        # no state management since n_step is expected to be dynamic too
        previous_state = None
    else:
        # native_client: this node's name and shape are part of the API boundary
        # with the native client, if you change them you should sync changes with
        # the C++ code.
        previous_state_c = tfv1.placeholder(
            tf.float32, [batch_size, Config.n_cell_dim], name="previous_state_c"
        )
        # native_client: this node's name and shape are part of the API boundary
        # with the native client, if you change them you should sync changes with
        # the C++ code.
        previous_state_h = tfv1.placeholder(
            tf.float32, [batch_size, Config.n_cell_dim], name="previous_state_h"
        )

        previous_state = tf.nn.rnn_cell.LSTMStateTuple(
            previous_state_c, previous_state_h
        )

    # One rate per layer
    no_dropout = [None] * 6

    rnn_impl = rnn_impl_static_rnn if tflite else rnn_impl_lstmblockfusedcell

    # Whether sequence lengths are wired in follows this call's own `tflite`
    # argument rather than the global Config.export_tflite. Callers that build
    # a non-TFLite graph then always get the "input_lengths" input, whatever
    # the export configuration says.
    logits, layers = create_model(
        batch_x=input_tensor,
        batch_size=batch_size,
        seq_length=seq_length if not tflite else None,
        dropout=no_dropout,
        previous_state=previous_state,
        overlap=False,
        rnn_impl=rnn_impl,
    )

    # TF Lite runtime will check that input dimensions are 1, 2 or 4
    # by default we get 3, the middle one being batch_size which is forced to
    # one on inference graph, so remove that dimension
    #
    # native_client: this node's name and shape are part of the API boundary
    # with the native client, if you change them you should sync changes with
    # the C++ code.
    if tflite:
        logits = tf.squeeze(logits, [1])

    # Apply softmax for CTC decoder
    probs = tf.nn.softmax(logits, name="logits")

    if dynamic_batch:
        return (
            {
                "input": input_tensor,
                "input_lengths": seq_length,
            },
            {
                "outputs": probs,
            },
            layers,
        )

    new_state_c, new_state_h = layers["rnn_output_state"]
    new_state_c = tf.identity(new_state_c, name="new_state_c")
    new_state_h = tf.identity(new_state_h, name="new_state_h")

    inputs = {
        "input": input_tensor,
        "previous_state_c": previous_state_c,
        "previous_state_h": previous_state_h,
        "input_samples": input_samples,
    }

    if not tflite:
        inputs["input_lengths"] = seq_length

    outputs = {
        "outputs": probs,
        "new_state_c": new_state_c,
        "new_state_h": new_state_h,
        "mfccs": mfccs,
        # Expose internal layers for downstream applications
        "layer_3": layers["layer_3"],
        "layer_5": layers["layer_5"],
    }

    return inputs, outputs, layers
|
|
||||||
|
|
||||||
|
|
||||||
def file_relative_read(fname):
    """
    Return the text content of ``fname``, resolved relative to the directory
    containing this module.

    :param fname: file name or relative path next to this module
    :return: the file's content as a string
    """
    path = os.path.join(os.path.dirname(__file__), fname)
    # Use a context manager so the handle is closed deterministically instead
    # of being left for the garbage collector.
    with open(path) as fin:
        return fin.read()
|
|
||||||
|
|
||||||
|
|
||||||
def export():
    r"""
    Restores the trained variables into a simpler graph that will be exported for serving.

    Writes either a frozen ``.pb`` graph or, when ``Config.export_tflite`` is
    set, a ``.tflite`` model into ``Config.export_dir``, followed by a
    Markdown metadata file describing the exported model.
    """
    log_info("Exporting the model...")

    tfv1.reset_default_graph()

    inputs, outputs, _ = create_inference_graph(
        batch_size=Config.export_batch_size,
        n_steps=Config.n_steps,
        tflite=Config.export_tflite,
    )

    graph_version = int(file_relative_read("GRAPH_VERSION").strip())
    assert graph_version > 0

    # native_client: these nodes' names and shapes are part of the API boundary
    # with the native client, if you change them you should sync changes with
    # the C++ code.
    outputs["metadata_version"] = tf.constant([graph_version], name="metadata_version")
    outputs["metadata_sample_rate"] = tf.constant(
        [Config.audio_sample_rate], name="metadata_sample_rate"
    )
    outputs["metadata_feature_win_len"] = tf.constant(
        [Config.feature_win_len], name="metadata_feature_win_len"
    )
    outputs["metadata_feature_win_step"] = tf.constant(
        [Config.feature_win_step], name="metadata_feature_win_step"
    )
    outputs["metadata_beam_width"] = tf.constant(
        [Config.export_beam_width], name="metadata_beam_width"
    )
    outputs["metadata_alphabet"] = tf.constant(
        [Config.alphabet.Serialize()], name="metadata_alphabet"
    )

    if Config.export_language:
        outputs["metadata_language"] = tf.constant(
            [Config.export_language.encode("utf-8")], name="metadata_language"
        )

    # Prevent further graph changes
    tfv1.get_default_graph().finalize()

    output_names_tensors = [
        tensor.op.name for tensor in outputs.values() if isinstance(tensor, tf.Tensor)
    ]
    output_names_ops = [
        op.name for op in outputs.values() if isinstance(op, tf.Operation)
    ]
    output_names = output_names_tensors + output_names_ops

    with tf.Session() as session:
        # Restore variables from checkpoint
        load_graph_for_evaluation(session)

        output_filename = Config.export_file_name + ".pb"
        if Config.remove_export and isdir_remote(Config.export_dir):
            log_info("Removing old export")
            # The target is a directory, so it needs a recursive delete;
            # the single-file remove_remote() wrapper cannot remove it.
            tf.io.gfile.rmtree(Config.export_dir)

        output_graph_path = os.path.join(Config.export_dir, output_filename)

        if not is_remote_path(Config.export_dir) and not os.path.isdir(
            Config.export_dir
        ):
            os.makedirs(Config.export_dir)

        frozen_graph = tfv1.graph_util.convert_variables_to_constants(
            sess=session,
            input_graph_def=tfv1.get_default_graph().as_graph_def(),
            output_node_names=output_names,
        )

        frozen_graph = tfv1.graph_util.extract_sub_graph(
            graph_def=frozen_graph, dest_nodes=output_names
        )

        if not Config.export_tflite:
            with open_remote(output_graph_path, "wb") as fout:
                fout.write(frozen_graph.SerializeToString())
        else:
            output_tflite_path = os.path.join(
                Config.export_dir, output_filename.replace(".pb", ".tflite")
            )

            converter = tf.lite.TFLiteConverter(
                frozen_graph,
                input_tensors=inputs.values(),
                output_tensors=outputs.values(),
            )

            if Config.export_quantize:
                converter.optimizations = [tf.lite.Optimize.DEFAULT]

            # AudioSpectrogram and Mfcc ops are custom but have built-in kernels in TFLite
            converter.allow_custom_ops = True
            tflite_model = converter.convert()

            with open_remote(output_tflite_path, "wb") as fout:
                fout.write(tflite_model)

        log_info("Models exported at %s" % (Config.export_dir))

    metadata_fname = os.path.join(
        Config.export_dir,
        "{}_{}_{}.md".format(
            Config.export_author_id,
            Config.export_model_name,
            Config.export_model_version,
        ),
    )

    model_runtime = "tflite" if Config.export_tflite else "tensorflow"
    # Front matter block followed by the free-form model description
    metadata_lines = [
        "---\n",
        "author: {}\n".format(Config.export_author_id),
        "model_name: {}\n".format(Config.export_model_name),
        "model_version: {}\n".format(Config.export_model_version),
        "contact_info: {}\n".format(Config.export_contact_info),
        "license: {}\n".format(Config.export_license),
        "language: {}\n".format(Config.export_language),
        "runtime: {}\n".format(model_runtime),
        "min_stt_version: {}\n".format(Config.export_min_stt_version),
        "max_stt_version: {}\n".format(Config.export_max_stt_version),
        "acoustic_model_url: <replace this with a publicly available URL of the acoustic model>\n",
        "scorer_url: <replace this with a publicly available URL of the scorer, if present>\n",
        "---\n",
        "{}\n".format(Config.export_description),
    ]
    with open_remote(metadata_fname, "w") as f:
        for line in metadata_lines:
            f.write(line)

    log_info(
        "Model metadata file saved to {}. Before submitting the exported model for publishing make sure all information in the metadata file is correct, and complete the URL fields.".format(
            metadata_fname
        )
    )
|
|
||||||
|
|
||||||
|
|
||||||
def package_zip():
    """
    Copy the scorer into the export directory and zip that directory up.

    Remote export directories are not packaged: an error is logged and the
    function returns without creating an archive.
    """
    # --export_dir path/to/export/LANG_CODE/ => path/to/export/LANG_CODE.zip
    absolute_dir = os.path.abspath(Config.export_dir)
    # Joining with an empty component forces a trailing path separator
    export_dir = os.path.join(absolute_dir, "")

    if is_remote_path(export_dir):
        log_error(
            "Cannot package remote path zip %s. Please do this manually." % export_dir
        )
        return

    shutil.copy(Config.scorer_path, export_dir)

    # With the trailing separator in place, dirname() just strips it, so the
    # archive is named after the export directory itself.
    archive_base = os.path.dirname(export_dir)
    archive = shutil.make_archive(archive_base, "zip", export_dir)
    log_info("Exported packaged model {}".format(archive))
|
|
||||||
|
|
||||||
|
|
||||||
def do_single_file_inference(input_file_path):
    """
    Transcribe a single audio file with the training graph and print the
    highest-probability transcript.

    :param input_file_path: path of the audio file to transcribe
    """
    tfv1.reset_default_graph()

    with tfv1.Session(config=Config.session_config) as session:
        inputs, outputs, _ = create_inference_graph(batch_size=1, n_steps=-1)

        # Restore variables from training checkpoint
        load_graph_for_evaluation(session)

        features, features_len = audiofile_to_features(input_file_path)
        previous_state_c = np.zeros([1, Config.n_cell_dim])
        previous_state_h = np.zeros([1, Config.n_cell_dim])

        # Add batch dimension
        features = tf.expand_dims(features, 0)
        features_len = tf.expand_dims(features_len, 0)

        # Evaluate
        features = create_overlapping_windows(features).eval(session=session)
        features_len = features_len.eval(session=session)

        feed_dict = {
            inputs["input"]: features,
            inputs["previous_state_c"]: previous_state_c,
            inputs["previous_state_h"]: previous_state_h,
        }
        # create_inference_graph() leaves "input_lengths" out of its inputs
        # when Config.export_tflite is set; feed it only when the graph has it
        # instead of failing with a KeyError.
        if "input_lengths" in inputs:
            feed_dict[inputs["input_lengths"]] = features_len

        probs = outputs["outputs"].eval(feed_dict=feed_dict, session=session)

        probs = np.squeeze(probs)

        if Config.scorer_path:
            scorer = Scorer(
                Config.lm_alpha, Config.lm_beta, Config.scorer_path, Config.alphabet
            )
        else:
            scorer = None
        decoded = ctc_beam_search_decoder(
            probs,
            Config.alphabet,
            Config.beam_width,
            scorer=scorer,
            cutoff_prob=Config.cutoff_prob,
            cutoff_top_n=Config.cutoff_top_n,
        )
        # Print highest probability result
        print(decoded[0][1])
|
|
||||||
|
|
||||||
|
|
||||||
def early_training_checks():
|
def early_training_checks():
|
||||||
# Check for proper scorer early
|
# Check for proper scorer early
|
||||||
if Config.scorer_path:
|
if Config.scorer_path:
|
||||||
|
@ -1289,36 +684,47 @@ def early_training_checks():
|
||||||
)
|
)
|
||||||
|
|
||||||
if not Config.alphabet_config_path and not Config.bytes_output_mode:
|
if not Config.alphabet_config_path and not Config.bytes_output_mode:
|
||||||
log_error("Missing --alphabet_config_path flag, can't continue")
|
raise RuntimeError("Missing --alphabet_config_path flag, can't continue")
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """
    Entry point of the training script.

    Trains when ``--train_files`` is given. The test, export and one-shot
    inference steps are still dispatched from here for backwards
    compatibility, each with a deprecation warning pointing at its
    dedicated script.
    """
    initialize_globals_from_cli()

    def deprecated_msg(prefix):
        # Shared tail of every deprecation warning emitted below
        return (
            f"{prefix} Using the training script as a generic driver for all training "
            "related functionality is deprecated and will be removed soon. Use "
            "the specific scripts: train.py/evaluate.py/export.py/training_graph_inference.py."
        )

    if Config.train_files:
        train()
    else:
        log_warn(deprecated_msg("Calling training script without --train_files."))

    if Config.test_files:
        log_warn(
            deprecated_msg(
                "Specifying --test_files when calling train.py script. Use evaluate.py."
            )
        )
        evaluate.test()

    if Config.export_dir:
        log_warn(
            deprecated_msg(
                "Specifying --export_dir when calling train.py script. Use export.py."
            )
        )
        export.export()

    if Config.one_shot_infer:
        log_warn(
            deprecated_msg(
                "Specifying --one_shot_infer when calling train.py script. Use training_graph_inference.py."
            )
        )
        # The module is imported as `training_graph_inference`; the previous
        # spelling `traning_graph_inference` raised NameError at this point.
        training_graph_inference.do_single_file_inference(Config.one_shot_infer)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
@ -0,0 +1,87 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
LOG_LEVEL_INDEX = sys.argv.index("--log_level") + 1 if "--log_level" in sys.argv else 0
|
||||||
|
DESIRED_LOG_LEVEL = (
|
||||||
|
sys.argv[LOG_LEVEL_INDEX] if 0 < LOG_LEVEL_INDEX < len(sys.argv) else "3"
|
||||||
|
)
|
||||||
|
os.environ["TF_CPP_MIN_LOG_LEVEL"] = DESIRED_LOG_LEVEL
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import tensorflow as tf
|
||||||
|
import tensorflow.compat.v1 as tfv1
|
||||||
|
|
||||||
|
from coqui_stt_ctcdecoder import ctc_beam_search_decoder, Scorer
|
||||||
|
from .deepspeech_model import create_inference_graph, create_overlapping_windows
|
||||||
|
from .util.checkpoints import load_graph_for_evaluation
|
||||||
|
from .util.config import Config, initialize_globals_from_cli, log_error
|
||||||
|
from .util.feeding import audiofile_to_features
|
||||||
|
|
||||||
|
|
||||||
|
def do_single_file_inference(input_file_path):
    """
    Transcribe a single audio file with the training graph and print the
    highest-probability transcript.

    :param input_file_path: path of the audio file to transcribe
    """
    tfv1.reset_default_graph()

    with tfv1.Session(config=Config.session_config) as session:
        inputs, outputs, _ = create_inference_graph(batch_size=1, n_steps=-1)

        # Restore variables from training checkpoint
        load_graph_for_evaluation(session)

        features, features_len = audiofile_to_features(input_file_path)
        previous_state_c = np.zeros([1, Config.n_cell_dim])
        previous_state_h = np.zeros([1, Config.n_cell_dim])

        # Add batch dimension
        features = tf.expand_dims(features, 0)
        features_len = tf.expand_dims(features_len, 0)

        # Evaluate
        features = create_overlapping_windows(features).eval(session=session)
        features_len = features_len.eval(session=session)

        feed_dict = {
            inputs["input"]: features,
            inputs["previous_state_c"]: previous_state_c,
            inputs["previous_state_h"]: previous_state_h,
        }
        # NOTE(review): the inference graph builder seen in train.py leaves
        # "input_lengths" out of its inputs when Config.export_tflite is set
        # (now the default). Feed it only when present instead of failing
        # with a KeyError. Confirm against deepspeech_model.py.
        if "input_lengths" in inputs:
            feed_dict[inputs["input_lengths"]] = features_len

        probs = outputs["outputs"].eval(feed_dict=feed_dict, session=session)

        probs = np.squeeze(probs)

        if Config.scorer_path:
            scorer = Scorer(
                Config.lm_alpha, Config.lm_beta, Config.scorer_path, Config.alphabet
            )
        else:
            scorer = None
        decoded = ctc_beam_search_decoder(
            probs,
            Config.alphabet,
            Config.beam_width,
            scorer=scorer,
            cutoff_prob=Config.cutoff_prob,
            cutoff_top_n=Config.cutoff_top_n,
        )
        # Print highest probability result
        print(decoded[0][1])
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """
    Command line entry point: run inference on the file given with
    ``--one_shot_infer``.

    :raises RuntimeError: when no ``--one_shot_infer`` file was specified
    """
    initialize_globals_from_cli()

    audio_path = Config.one_shot_infer
    if not audio_path:
        raise RuntimeError(
            "Calling training_graph_inference script directly but no --one_shot_infer input audio file specified"
        )

    tfv1.reset_default_graph()
    do_single_file_inference(audio_path)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
|
@ -477,7 +477,7 @@ class _SttConfig(Coqpit):
|
||||||
default=False, metadata=dict(help="whether to remove old exported models")
|
default=False, metadata=dict(help="whether to remove old exported models")
|
||||||
)
|
)
|
||||||
export_tflite: bool = field(
|
export_tflite: bool = field(
|
||||||
default=False, metadata=dict(help="export a graph ready for TF Lite engine")
|
default=True, metadata=dict(help="export a graph ready for TF Lite engine")
|
||||||
)
|
)
|
||||||
export_quantize: bool = field(
|
export_quantize: bool = field(
|
||||||
default=True,
|
default=True,
|
||||||
|
|
|
@ -90,3 +90,10 @@ def remove_remote(filename):
|
||||||
"""
|
"""
|
||||||
# Conditional import
|
# Conditional import
|
||||||
return gfile.remove(filename)
|
return gfile.remove(filename)
|
||||||
|
|
||||||
|
|
||||||
|
def rmtree_remote(foldername):
    """
    Wrapper around ``gfile.rmtree`` for deleting a directory, whether it is a
    local path or a remote one like `gs://...`

    :param foldername: path of the directory to delete
    :return: whatever ``gfile.rmtree`` returns
    """
    removal_result = gfile.rmtree(foldername)
    return removal_result
|
||||||
|
|
Loading…
Reference in New Issue