Merge pull request #38 from mozilla/issue6

WIP Integration of CTC
This commit is contained in:
Kelly Davis 2016-09-20 10:12:13 +02:00 committed by GitHub
commit 8d6f188eb6
14 changed files with 5451 additions and 957 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
.ipynb_checkpoints
*.pyc

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1 @@
0 46797 She had your dark suit in greasy wash water all year.

BIN
data/smoke_test/LDC93S1.wav Normal file

Binary file not shown.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 81 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 40 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 29 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 167 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 186 KiB

0
util/__init__.py Normal file
View File

80
util/audio/__init__.py Normal file
View File

@ -0,0 +1,80 @@
import numpy as np
import scipy.io.wavfile as wav
from python_speech_features import mfcc
def _add_context(features, numcep, numcontext):
    """Append past and future context frames to every MFCC frame.

    Each output row is the concatenation of the ``numcontext`` preceding
    frames, the frame itself and the ``numcontext`` following frames, so a
    ``numcep`` dimensional frame becomes ``numcep + 2*numcep*numcontext``
    dimensional. Positions before the first or after the last frame are
    filled with zeros (TODO: fill with the MFCC of silence instead).
    """
    features = np.asarray(features, dtype=np.float64)
    num_frames = features.shape[0]
    # Zero frames standing in for the context missing at either end
    padding = np.zeros((numcontext, numcep), dtype=np.float64)
    padded = np.concatenate((padding, features, padding))
    # Window `offset` holds, for every frame t, the frame t - numcontext + offset
    windows = [padded[offset:offset + num_frames]
               for offset in range(2 * numcontext + 1)]
    return np.concatenate(windows, axis=1)


def audiofiles_to_audio_data_sets(audio_filenames, numcep, numcontext):
    """Turn wav files into whitened MFCC features with context windows.

    Args:
        audio_filenames: iterable of paths to wav files
        numcep: number of cepstral coefficients computed per frame
        numcontext: number of past and of future frames appended to each frame

    Returns:
        A tuple (inputs, input_seq_lens). inputs is an array holding one
        (frames, numcep + 2*numcep*numcontext) matrix per file; when the files
        have different frame counts it is a 1-D object array of such matrices.
        input_seq_lens is the list of frame counts, one per file.

    Raises:
        ValueError: if numcontext is negative.
    """
    if numcontext < 0:
        raise ValueError('numcontext must be non-negative')
    # Define audio_data_sets to return
    inputs = []
    input_seq_lens = []
    # Loop over audio_filenames
    for audio_filename in audio_filenames:
        # Load wav files
        fs, audio = wav.read(audio_filename)
        # Get mfcc coefficients
        orig_inputs = mfcc(audio, samplerate=fs, numcep=numcep)
        # Surround every frame with its past and future context
        train_inputs = _add_context(orig_inputs, numcep, numcontext)
        # Whiten inputs (TODO: Should we whiten)
        if train_inputs.size:
            train_inputs = train_inputs - np.mean(train_inputs)
            spread = np.std(train_inputs)
            # A constant signal has zero spread; dividing would only yield NaN
            if spread > 0:
                train_inputs = train_inputs / spread
        # Obtain array of sequence lengths
        input_seq_lens.append(train_inputs.shape[0])
        # Convert train_inputs to proper form
        inputs.append(train_inputs)
    # Recordings of different lengths cannot be stacked into one dense array
    if len(set(input_seq_lens)) > 1:
        ragged = np.empty(len(inputs), dtype=object)
        for index, features in enumerate(inputs):
            ragged[index] = features
        return (ragged, input_seq_lens)
    # Return results
    return (np.asarray(inputs), input_seq_lens)

View File

View File

@ -0,0 +1,86 @@
import numpy as np
from os import path
from util.text import text_to_sparse_tuple
from util.audio import audiofiles_to_audio_data_sets
class DataSets(object):
    """Read-only bundle of the train, validation and test data sets."""

    def __init__(self, train, validation, test):
        # Stored under private names and exposed through the properties below
        self._train, self._validation, self._test = train, validation, test

    train = property(lambda self: self._train,
                     doc="Data set passed as ``train``.")
    validation = property(lambda self: self._validation,
                          doc="Data set passed as ``validation``.")
    test = property(lambda self: self._test,
                    doc="Data set passed as ``test``.")
class DataSet(object):
    """Inputs, outputs and sequence lengths of one data split."""

    def __init__(self, inputs, outputs, seq_len):
        self._inputs = inputs
        self._outputs = outputs
        self._seq_len = seq_len
        # Running total of the batch sizes requested so far
        self._offset = 0

    def next_batch(self, batch_size):
        """Return (inputs, outputs, seq_len) and advance the offset.

        The whole data set is returned regardless of ``batch_size``.
        """
        self._offset += batch_size
        # TODO: Choose only batch_size elements
        return (self._inputs, self._outputs, self._seq_len)

    @property
    def max_batch_seq_len(self):
        """Largest value among the stored sequence lengths."""
        return np.max(self._seq_len)

    @property
    def num_examples(self):
        """Size of the first dimension of the inputs."""
        example_count = self._inputs.shape[0]
        return example_count
def read_data_sets(data_dir, numcep, numcontext):
    """Load the train, validation and test splits found in ``data_dir``.

    For each split the text outputs are read first, then the audio inputs
    and their sequence lengths. The result is a DataSets object wrapping one
    DataSet per split.
    """
    loaded = []
    for split in ('train', 'validation', 'test'):
        labels = read_text_data_sets(data_dir, split)
        features, lengths = read_audio_data_sets(data_dir, numcep, numcontext, split)
        loaded.append((features, labels, lengths))
    # Wrap every split only after all of them have been read
    train, validation, test = [
        DataSet(inputs=features, outputs=labels, seq_len=lengths)
        for features, labels, lengths in loaded
    ]
    return DataSets(train=train, validation=validation, test=test)
def read_text_data_sets(data_dir, data_type):
    """Read the transcript in ``data_dir`` and return it as a sparse tuple.

    Each line of LDC93S1.txt is lower-cased, its first two space-separated
    fields are dropped and every '.' is removed. Only the last line of the
    file ends up in the result.

    Args:
        data_dir: directory containing LDC93S1.txt
        data_type: currently ignored
            (TODO: Do not ignore data_type = ['train'|'validation'|'test'])

    Returns:
        The result of text_to_sparse_tuple for the processed transcript.

    Raises:
        ValueError: if the transcript file contains no lines.
    """
    # Create file names
    text_filename = path.join(data_dir, 'LDC93S1.txt')
    original = None
    # Text mode keeps the str operations below valid on Python 2 and Python 3
    with open(text_filename, 'r') as f:
        for line in f:
            original = ' '.join(line.strip().lower().split(' ')[2:]).replace('.', '')
    if original is None:
        raise ValueError('No transcript found in %s' % text_filename)
    return text_to_sparse_tuple([original])
def read_audio_data_sets(data_dir, numcep, numcontext, data_type):
    """Return the audio features of LDC93S1.wav located in ``data_dir``.

    ``data_type`` is currently ignored.
    TODO: Do not ignore data_type = ['train'|'validation'|'test']
    """
    wav_files = [path.join(data_dir, 'LDC93S1.wav')]
    # Delegate feature extraction and formatting to the audio utilities
    return audiofiles_to_audio_data_sets(wav_files, numcep, numcontext)

50
util/text/__init__.py Normal file
View File

@ -0,0 +1,50 @@
import numpy as np
# Constants
# Placeholder inserted for a word boundary while a sentence is tokenized
SPACE_TOKEN = '<space>'
# Label index that SPACE_TOKEN is mapped to
SPACE_INDEX = 0
# Offset subtracted from ord(character), so that 'a' maps to 1, 'b' to 2, ...
FIRST_INDEX = ord('a') - 1 # 0 is reserved to space
def text_to_sparse_tuple(originals):
    """Convert sentences into a sparse tuple of character indices.

    Every character is mapped to ``ord(character) - FIRST_INDEX`` and every
    space between words to ``SPACE_INDEX``.

    Args:
        originals: iterable of sentences (strings)

    Returns:
        The (indices, values, shape) tuple built by sparse_tuple_from.
    """
    # Define list to hold results
    results = []
    # Process each original in originals
    for original in originals:
        # Double every space so that splitting on ' ' leaves an empty string
        # at each word boundary
        words = original.replace(' ', '  ').split(' ')
        # Tokenize words into letters adding in SPACE_TOKEN where required
        tokens = []
        for word in words:
            if word == '':
                tokens.append(SPACE_TOKEN)
            else:
                tokens.extend(word)
        # Map characters into indices
        result = np.asarray([SPACE_INDEX if token == SPACE_TOKEN else ord(token) - FIRST_INDEX
                             for token in tokens])
        # Add result to results
        results.append(result)
    # Creating sparse representation to feed the placeholder
    return sparse_tuple_from(results)
def sparse_tuple_from(sequences, dtype=np.int32):
    """Create a sparse representation of sequences.

    Args:
        sequences: a list of lists of type dtype where each element is a sequence
        dtype: NumPy dtype of the returned values array

    Returns:
        A tuple with (indices, values, shape). indices is an (N, 2) int64
        array of (sequence number, position) pairs, values holds the N
        elements in the same order, and shape is
        [number of sequences, length of the longest sequence].
    """
    indices = []
    values = []
    for n, seq in enumerate(sequences):
        # range exists on both Python 2 and Python 3, unlike xrange
        indices.extend((n, position) for position in range(len(seq)))
        values.extend(seq)
    # The reshape keeps the (N, 2) layout even when there are no elements
    indices = np.asarray(indices, dtype=np.int64).reshape(-1, 2)
    values = np.asarray(values, dtype=dtype)
    # Without any element there is no position to take the maximum of
    max_len = indices[:, 1].max() + 1 if len(indices) else 0
    shape = np.asarray([len(sequences), max_len], dtype=np.int64)
    return indices, values, shape