Feature caching
parent 5ff3b31bbd
commit 44e502e236
DeepSpeech.py

@@ -25,6 +25,7 @@ from tensorflow.python.tools import freeze_graph
 from threading import Thread, Lock
 from util.audio import audiofile_to_input_vector
 from util.feeding import DataSet, ModelFeeder
+from util.preprocess import preprocess
 from util.gpu import get_available_gpus
 from util.shared_lib import check_cupti
 from util.text import sparse_tensor_value_to_texts, wer, levenshtein, Alphabet, ndarray_to_text
@@ -40,6 +41,10 @@ def create_flags():
     tf.app.flags.DEFINE_string ('test_files', '', 'comma separated list of files specifying the dataset used for testing. multiple files will get merged')
     tf.app.flags.DEFINE_boolean ('fulltrace', False, 'if full trace debug info should be generated during training')

+    tf.app.flags.DEFINE_string ('train_cached_features_path', '', 'path to a HDF5 file used to cache precomputed features of the training set')
+    tf.app.flags.DEFINE_string ('dev_cached_features_path', '', 'path to a HDF5 file used to cache precomputed features of the validation set')
+    tf.app.flags.DEFINE_string ('test_cached_features_path', '', 'path to a HDF5 file used to cache precomputed features of the test set')
+
     # Cluster configuration
     # =====================

@@ -402,7 +407,7 @@ def BiRNN(batch_x, seq_length, dropout, reuse=False, batch_size=None, n_steps=-1
     # This is done to prepare the batch for input into the first layer which expects a tensor of rank `2`.

     # Permute n_steps and batch_size
-    batch_x = tf.transpose(batch_x, [1, 0, 2])
+    batch_x = tf.transpose(batch_x, [1, 0, 2, 3])
     # Reshape to prepare input for first layer
     batch_x = tf.reshape(batch_x, [-1, n_input + 2*n_input*n_context]) # (n_steps*batch_size, n_input + 2*n_input*n_context)
     layers['input_reshaped'] = batch_x
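For reference, a minimal numpy sketch of the shape bookkeeping behind this one-line change (all sizes are illustrative): the feeder now delivers a rank-4 batch [batch_size, n_steps, window, n_input] instead of pre-flattened context windows, so the permutation needs a fourth axis before the rank-2 reshape.

import numpy as np

batch_size, n_steps, n_context, n_input = 2, 5, 9, 26
window = 2 * n_context + 1                                   # 19 frames per window

batch_x = np.zeros((batch_size, n_steps, window, n_input))   # rank-4 input batch
batch_x = np.transpose(batch_x, [1, 0, 2, 3])                # -> [n_steps, batch_size, window, n_input]
batch_x = np.reshape(batch_x, [-1, n_input + 2 * n_input * n_context])
print(batch_x.shape)   # (10, 494) == (n_steps*batch_size, n_input + 2*n_input*n_context)
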
@@ -1459,19 +1464,40 @@ def train(server=None):
     global_step = tf.Variable(0, trainable=False, name='global_step')

     # Reading training set
-    train_set = DataSet(FLAGS.train_files.split(','),
+    train_data = preprocess(FLAGS.train_files.split(','),
                         FLAGS.train_batch_size,
+                        n_input,
+                        n_context,
+                        alphabet,
+                        hdf5_cache_path=FLAGS.train_cached_features_path)
+
+    train_set = DataSet(train_data,
+                        FLAGS.train_batch_size,
                         limit=FLAGS.limit_train,
                         next_index=lambda i: COORD.get_next_index('train'))

     # Reading validation set
-    dev_set = DataSet(FLAGS.dev_files.split(','),
+    dev_data = preprocess(FLAGS.dev_files.split(','),
                         FLAGS.dev_batch_size,
+                        n_input,
+                        n_context,
+                        alphabet,
+                        hdf5_cache_path=FLAGS.dev_cached_features_path)
+
+    dev_set = DataSet(dev_data,
+                        FLAGS.dev_batch_size,
                         limit=FLAGS.limit_dev,
                         next_index=lambda i: COORD.get_next_index('dev'))

     # Reading test set
-    test_set = DataSet(FLAGS.test_files.split(','),
+    test_data = preprocess(FLAGS.test_files.split(','),
                         FLAGS.test_batch_size,
+                        n_input,
+                        n_context,
+                        alphabet,
+                        hdf5_cache_path=FLAGS.test_cached_features_path)
+
+    test_set = DataSet(test_data,
+                        FLAGS.test_batch_size,
                         limit=FLAGS.limit_test,
                         next_index=lambda i: COORD.get_next_index('test'))
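A hedged sketch of how these pieces compose outside of train(); the CSV path, batch size, cache path and alphabet file are made-up placeholders. On the first run preprocess() computes MFCC features and writes the HDF5 cache; a later run with the same cache path loads it instead of recomputing.

from util.feeding import DataSet
from util.preprocess import preprocess
from util.text import Alphabet

alphabet = Alphabet('data/alphabet.txt')      # placeholder alphabet file

# numcep=26 and numcontext=9 mirror the constants used elsewhere in this change.
train_data = preprocess(['data/train.csv'], 16, 26, 9, alphabet,
                        hdf5_cache_path='/tmp/train_cache.hdf5')
train_set = DataSet(train_data, 16)
print(train_set.total_batches)
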
evaluate.py (83 changed lines)

@@ -19,6 +19,7 @@ from multiprocessing import Pool
 from six.moves import zip, range
 from util.audio import audiofile_to_input_vector
 from util.text import sparse_tensor_value_to_texts, text_to_char_array, Alphabet, ctc_label_dense_to_sparse, wer, levenshtein
+from util.preprocess import pmap, preprocess


 FLAGS = tf.app.flags.FLAGS
@@ -28,88 +29,6 @@ N_FEATURES = 26
 N_CONTEXT = 9


-def pmap(fun, iterable, threads=8):
-    pool = Pool(threads)
-    results = pool.map(fun, iterable)
-    pool.close()
-    return results
-
-
-def process_single_file(row):
-    # row = index, Series
-    _, file = row
-    features = audiofile_to_input_vector(file.wav_filename, N_FEATURES, N_CONTEXT)
-    transcript = text_to_char_array(file.transcript, alphabet)
-
-    return features, len(features), transcript, len(transcript)
-
-
-# load samples from CSV, compute features, optionally cache results on disk
-def preprocess(dataset_files, batch_size, hdf5_dest_path=None):
-    COLUMNS = ('features', 'features_len', 'transcript', 'transcript_len')
-
-    if hdf5_dest_path and os.path.exists(hdf5_dest_path):
-        with tables.open_file(hdf5_dest_path, 'r') as file:
-            features = file.root.features[:]
-            features_len = file.root.features_len[:]
-            transcript = file.root.transcript[:]
-            transcript_len = file.root.transcript_len[:]
-
-            # features are stored flattened, so reshape into
-            # [n_steps, (n_input + 2*n_context*n_input)]
-            for i in range(len(features)):
-                features[i] = np.reshape(features[i], [features_len[i], -1])
-
-            in_data = list(zip(features, features_len,
-                               transcript, transcript_len))
-            return pandas.DataFrame(data=in_data, columns=COLUMNS)
-
-    csv_files = dataset_files.split(',')
-    source_data = None
-    for csv in csv_files:
-        file = pandas.read_csv(csv, encoding='utf-8', na_filter=False)
-        if source_data is None:
-            source_data = file
-        else:
-            source_data = source_data.append(file)
-
-    # discard last samples if dataset does not divide batch size evenly
-    if len(source_data) % batch_size != 0:
-        source_data = source_data[:-(len(source_data) % batch_size)]
-
-    out_data = pmap(process_single_file, source_data.iterrows())
-
-    if hdf5_dest_path:
-        # list of tuples -> tuple of lists
-        features, features_len, transcript, transcript_len = zip(*out_data)
-
-        with tables.open_file(hdf5_dest_path, 'w') as file:
-            features_dset = file.create_vlarray(file.root,
-                                                'features',
-                                                tables.Float32Atom(),
-                                                filters=tables.Filters(complevel=1))
-            # VLArray atoms need to be 1D, so flatten feature array
-            for f in features:
-                features_dset.append(np.reshape(f, -1))
-
-            features_len_dset = file.create_array(file.root,
-                                                  'features_len',
-                                                  features_len)
-
-            transcript_dset = file.create_vlarray(file.root,
-                                                  'transcript',
-                                                  tables.Int32Atom(),
-                                                  filters=tables.Filters(complevel=1))
-            for t in transcript:
-                transcript_dset.append(t)
-
-            transcript_len_dset = file.create_array(file.root,
-                                                    'transcript_len',
-                                                    transcript_len)
-
-    return pandas.DataFrame(data=out_data, columns=COLUMNS)
-
-
 def split_data(dataset, batch_size):
     remainder = len(dataset) % batch_size
     if remainder != 0:
requirements.txt

@@ -13,3 +13,4 @@ pyxdg
 bs4
 six
 requests
+tables
util/audio.py

@@ -1,53 +1,7 @@
 from __future__ import absolute_import, print_function

 import numpy as np
 import scipy.io.wavfile as wav
-import sys
-import warnings
-
-class DeepSpeechDeprecationWarning(DeprecationWarning):
-    pass
-
-warnings.simplefilter('once', category=DeepSpeechDeprecationWarning)
-
-try:
-    from deepspeech import audioToInputVector
-except ImportError:
-    warnings.warn('DeepSpeech Python bindings could not be imported, resorting to slower code to compute audio features. '
-                  'Refer to README.md for instructions on how to install (or build) the DeepSpeech Python bindings.',
-                  category=DeepSpeechDeprecationWarning)
-
-    import numpy as np
-    from python_speech_features import mfcc
-    from six.moves import range
-
-    def audioToInputVector(audio, fs, numcep, numcontext):
-        # Get mfcc coefficients
-        features = mfcc(audio, samplerate=fs, numcep=numcep)
-
-        # We only keep every second feature (BiRNN stride = 2)
-        features = features[::2]
-
-        # One stride per time step in the input
-        num_strides = len(features)
-
-        # Add empty initial and final contexts
-        empty_context = np.zeros((numcontext, numcep), dtype=features.dtype)
-        features = np.concatenate((empty_context, features, empty_context))
-
-        # Create a view into the array with overlapping strides of size
-        # numcontext (past) + 1 (present) + numcontext (future)
-        window_size = 2*numcontext+1
-        train_inputs = np.lib.stride_tricks.as_strided(
-            features,
-            (num_strides, window_size, numcep),
-            (features.strides[0], features.strides[0], features.strides[1]),
-            writeable=False)
-
-        # Flatten the second and third dimensions
-        train_inputs = np.reshape(train_inputs, [num_strides, -1])
-
-        # Return results
-        return train_inputs
+from python_speech_features import mfcc


@@ -60,4 +14,14 @@ def audiofile_to_input_vector(audio_filename, numcep, numcontext):
     # Load wav files
     fs, audio = wav.read(audio_filename)

-    return audioToInputVector(audio, fs, numcep, numcontext)
+    # Get mfcc coefficients
+    features = mfcc(audio, samplerate=fs, numcep=numcep)
+
+    # We only keep every second feature (BiRNN stride = 2)
+    features = features[::2]
+
+    # Add empty initial and final contexts
+    empty_context = np.zeros((numcontext, numcep), dtype=features.dtype)
+    features = np.concatenate((empty_context, features, empty_context))
+
+    return features
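A small self-contained sanity check of the simplified helper above, using a synthetic tone instead of a wav file (26 cepstral coefficients and a context of 9 mirror the constants used elsewhere in this change):

import numpy as np
from python_speech_features import mfcc

fs = 16000
audio = np.sin(2 * np.pi * 440 * np.arange(fs) / fs)        # one second of synthetic audio

numcep, numcontext = 26, 9
features = mfcc(audio, samplerate=fs, numcep=numcep)[::2]   # keep every second frame
empty_context = np.zeros((numcontext, numcep), dtype=features.dtype)
features = np.concatenate((empty_context, features, empty_context))

# Roughly (2*numcontext + 50, numcep) for one second of audio; the overlapping
# context windows are now built later, in the feeding code, not here.
print(features.shape)
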
util/feeding.py

@@ -1,12 +1,12 @@
+import pandas
 import numpy as np
 import tensorflow as tf

+from threading import Thread
 from math import ceil
 from six.moves import range
-from util.audio import audiofile_to_input_vector
-from threading import Thread
 from util.gpu import get_available_gpus
-from util.text import ctc_label_dense_to_sparse, text_to_char_array
+from util.text import ctc_label_dense_to_sparse


 class ModelFeeder(object):
     '''
@@ -24,7 +24,7 @@ class ModelFeeder(object):
                  numcontext,
                  alphabet,
                  tower_feeder_count=-1,
-                 threads_per_queue=2):
+                 threads_per_queue=4):

         self.train = train_set
         self.dev = dev_set
@@ -35,7 +35,7 @@ class ModelFeeder(object):
         self.tower_feeder_count = max(len(get_available_gpus()), 1) if tower_feeder_count < 0 else tower_feeder_count
         self.threads_per_queue = threads_per_queue

-        self.ph_x = tf.placeholder(tf.float32, [None, numcep + (2 * numcep * numcontext)])
+        self.ph_x = tf.placeholder(tf.float32, [None, 2*numcontext+1, numcep])
         self.ph_x_length = tf.placeholder(tf.int32, [])
         self.ph_y = tf.placeholder(tf.int32, [None,])
         self.ph_y_length = tf.placeholder(tf.int32, [])
@@ -77,27 +77,19 @@ class ModelFeeder(object):
         '''
         return self._tower_feeders[tower_feeder_index].next_batch()


 class DataSet(object):
     '''
     Represents a collection of audio samples and their respective transcriptions.
     Takes a set of CSV files produced by importers in /bin.
     '''
-    def __init__(self, csvs, batch_size, skip=0, limit=0, ascending=True, next_index=lambda i: i + 1):
+    def __init__(self, data, batch_size, skip=0, limit=0, ascending=True, next_index=lambda i: i + 1):
+        self.data = data
+        self.data.sort_values(by="features_len", ascending=ascending, inplace=True)
         self.batch_size = batch_size
         self.next_index = next_index
-        self.files = None
-        for csv in csvs:
-            file = pandas.read_csv(csv, encoding='utf-8', na_filter=False)
-            if self.files is None:
-                self.files = file
-            else:
-                self.files = self.files.append(file)
-        self.files = self.files.sort_values(by="wav_filesize", ascending=ascending) \
-                               .ix[:, ["wav_filename", "transcript"]] \
-                               .values[skip:]
-        if limit > 0:
-            self.files = self.files[:limit]
-        self.total_batches = int(ceil(len(self.files) / batch_size))
+        self.total_batches = int(ceil(len(self.data) / batch_size))


 class _DataSetLoader(object):
     '''
@@ -109,9 +101,9 @@ class _DataSetLoader(object):
     def __init__(self, model_feeder, data_set, alphabet):
         self._model_feeder = model_feeder
         self._data_set = data_set
-        self.queue = tf.PaddingFIFOQueue(shapes=[[None, model_feeder.numcep + (2 * model_feeder.numcep * model_feeder.numcontext)], [], [None,], []],
+        self.queue = tf.PaddingFIFOQueue(shapes=[[None, 2 * model_feeder.numcontext + 1, model_feeder.numcep], [], [None,], []],
                                          dtypes=[tf.float32, tf.int32, tf.int32, tf.int32],
-                                         capacity=data_set.batch_size * 2)
+                                         capacity=data_set.batch_size * 8)
         self._enqueue_op = self.queue.enqueue([model_feeder.ph_x, model_feeder.ph_x_length, model_feeder.ph_y, model_feeder.ph_y_length])
         self._close_op = self.queue.close(cancel_pending_enqueues=True)
         self._alphabet = alphabet
@@ -138,25 +130,35 @@
         '''
         Queue thread routine.
         '''
-        file_count = len(self._data_set.files)
+        file_count = len(self._data_set.data)
         index = -1
         while not coord.should_stop():
             index = self._data_set.next_index(index) % file_count
-            wav_file, transcript = self._data_set.files[index]
-            source = audiofile_to_input_vector(wav_file, self._model_feeder.numcep, self._model_feeder.numcontext)
-            source_len = len(source)
-            target = text_to_char_array(transcript, self._alphabet)
-            target_len = len(target)
-            if source_len < target_len:
-                raise ValueError('Error: Audio file {} is too short for transcription.'.format(wav_file))
+            features, _, transcript, transcript_len = self._data_set.data.iloc[index]
+
+            # One stride per time step in the input
+            num_strides = len(features) - (self._model_feeder.numcontext * 2)
+
+            # Create a view into the array with overlapping strides of size
+            # numcontext (past) + 1 (present) + numcontext (future)
+            window_size = 2*self._model_feeder.numcontext+1
+            features = np.lib.stride_tricks.as_strided(
+                features,
+                (num_strides, window_size, self._model_feeder.numcep),
+                (features.strides[0], features.strides[0], features.strides[1]),
+                writeable=False)
+
             try:
-                session.run(self._enqueue_op, feed_dict={ self._model_feeder.ph_x: source,
-                                                          self._model_feeder.ph_x_length: source_len,
-                                                          self._model_feeder.ph_y: target,
-                                                          self._model_feeder.ph_y_length: target_len })
+                session.run(self._enqueue_op, feed_dict={
+                    self._model_feeder.ph_x: features,
+                    self._model_feeder.ph_x_length: num_strides,
+                    self._model_feeder.ph_y: transcript,
+                    self._model_feeder.ph_y_length: transcript_len
+                })
             except tf.errors.CancelledError:
                 return


 class _TowerFeeder(object):
     '''
     Internal class that represents a switchable input queue for one tower.
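A small standalone demonstration of the as_strided windowing the loader now performs (toy sizes; the real code uses numcep=26 and numcontext=9). Each time step becomes a zero-copy view of 2*numcontext+1 consecutive padded frames:

import numpy as np

numcep, numcontext = 3, 2
window_size = 2 * numcontext + 1

# 10 padded "time steps" of features, as produced by audiofile_to_input_vector
features = np.arange(10 * numcep, dtype=np.float32).reshape(10, numcep)

num_strides = len(features) - 2 * numcontext
windows = np.lib.stride_tricks.as_strided(
    features,
    (num_strides, window_size, numcep),
    (features.strides[0], features.strides[0], features.strides[1]),
    writeable=False)

print(windows.shape)      # (6, 5, 3): one overlapping window per time step
print(windows[0, :, 0])   # [ 0.  3.  6.  9. 12.] -- rows 0..4 of `features`, no copy made
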
util/preprocess.py (new file, 98 lines)

@@ -0,0 +1,98 @@
+import numpy as np
+import os
+import pandas
+import tables
+
+from functools import partial
+from multiprocessing.dummy import Pool
+from util.audio import audiofile_to_input_vector
+from util.text import text_to_char_array
+
+def pmap(fun, iterable, threads=8):
+    pool = Pool(threads)
+    results = pool.map(fun, iterable)
+    pool.close()
+    return results
+
+
+def process_single_file(row, numcep, numcontext, alphabet):
+    # row = index, Series
+    _, file = row
+    features = audiofile_to_input_vector(file.wav_filename, numcep, numcontext)
+    transcript = text_to_char_array(file.transcript, alphabet)
+
+    if (2*numcontext + len(features)) < len(transcript):
+        raise ValueError('Error: Audio file {} is too short for transcription.'.format(file.wav_filename))
+
+    return features, len(features), transcript, len(transcript)
+
+
+# load samples from CSV, compute features, optionally cache results on disk
+def preprocess(csv_files, batch_size, numcep, numcontext, alphabet, hdf5_cache_path=None):
+    COLUMNS = ('features', 'features_len', 'transcript', 'transcript_len')
+
+    print('Preprocessing', csv_files)
+
+    if hdf5_cache_path and os.path.exists(hdf5_cache_path):
+        with tables.open_file(hdf5_cache_path, 'r') as file:
+            features = file.root.features[:]
+            features_len = file.root.features_len[:]
+            transcript = file.root.transcript[:]
+            transcript_len = file.root.transcript_len[:]
+
+            # features are stored flattened, so reshape into
+            # [n_steps, (n_input + 2*n_context*n_input)]
+            for i in range(len(features)):
+                features[i] = np.reshape(features[i], [features_len[i], -1])
+
+            in_data = list(zip(features, features_len,
+                               transcript, transcript_len))
+            print('Loaded from cache at', hdf5_cache_path)
+            return pandas.DataFrame(data=in_data, columns=COLUMNS)
+
+    source_data = None
+    for csv in csv_files:
+        file = pandas.read_csv(csv, encoding='utf-8', na_filter=False)
+        if source_data is None:
+            source_data = file
+        else:
+            source_data = source_data.append(file)
+
+    step_fn = partial(process_single_file,
+                      numcep=numcep,
+                      numcontext=numcontext,
+                      alphabet=alphabet)
+    out_data = pmap(step_fn, source_data.iterrows())
+
+    if hdf5_cache_path:
+        print('Saving to', hdf5_cache_path)
+
+        # list of tuples -> tuple of lists
+        features, features_len, transcript, transcript_len = zip(*out_data)
+
+        with tables.open_file(hdf5_cache_path, 'w') as file:
+            features_dset = file.create_vlarray(file.root,
+                                                'features',
+                                                tables.Float32Atom(),
+                                                filters=tables.Filters(complevel=1))
+            # VLArray atoms need to be 1D, so flatten feature array
+            for f in features:
+                features_dset.append(np.reshape(f, -1))
+
+            features_len_dset = file.create_array(file.root,
+                                                  'features_len',
+                                                  features_len)
+
+            transcript_dset = file.create_vlarray(file.root,
+                                                  'transcript',
+                                                  tables.Int32Atom(),
+                                                  filters=tables.Filters(complevel=1))
+            for t in transcript:
+                transcript_dset.append(t)
+
+            transcript_len_dset = file.create_array(file.root,
+                                                    'transcript_len',
+                                                    transcript_len)
+
+    print('Preprocessing done')
+    return pandas.DataFrame(data=out_data, columns=COLUMNS)
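A hedged sketch of reading one of these cache files back directly; the path is a placeholder, and the snippet simply mirrors the loading branch of preprocess() above:

import numpy as np
import tables

with tables.open_file('/tmp/train_cache.hdf5', 'r') as f:
    features = f.root.features[:]            # flattened float32 vectors (VLArray)
    features_len = f.root.features_len[:]    # time steps per sample
    transcript = f.root.transcript[:]        # int32-encoded transcripts (VLArray)
    transcript_len = f.root.transcript_len[:]

# Restore each sample's 2-D feature matrix (stored flattened for the VLArray)
first = np.reshape(features[0], [features_len[0], -1])
print(first.shape, transcript_len[0])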