commit
8d6f188eb6
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
.ipynb_checkpoints
|
||||||
|
*.pyc
|
6187
DeepSpeech.ipynb
6187
DeepSpeech.ipynb
File diff suppressed because it is too large
Load Diff
1
data/smoke_test/LDC93S1.txt
Normal file
1
data/smoke_test/LDC93S1.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
0 46797 She had your dark suit in greasy wash water all year.
|
BIN
data/smoke_test/LDC93S1.wav
Normal file
BIN
data/smoke_test/LDC93S1.wav
Normal file
Binary file not shown.
Binary file not shown.
Before Width: | Height: | Size: 81 KiB |
Binary file not shown.
Before Width: | Height: | Size: 40 KiB |
Binary file not shown.
Before Width: | Height: | Size: 29 KiB |
Binary file not shown.
Before Width: | Height: | Size: 167 KiB |
Binary file not shown.
Before Width: | Height: | Size: 186 KiB |
0
util/__init__.py
Normal file
0
util/__init__.py
Normal file
80
util/audio/__init__.py
Normal file
80
util/audio/__init__.py
Normal file
@ -0,0 +1,80 @@
|
|||||||
|
import numpy as np
|
||||||
|
import scipy.io.wavfile as wav
|
||||||
|
|
||||||
|
from python_speech_features import mfcc
|
||||||
|
|
||||||
|
def audiofiles_to_audio_data_sets(audio_filenames, numcep, numcontext):
    """Turn WAV files into whitened MFCC feature matrices with context.

    Each file is read with ``wav.read`` and converted to MFCC features with
    ``numcep`` coefficients per frame. Every frame is then widened with the
    ``numcontext`` frames before and after it, so one row holds
    ``numcep + 2*numcep*numcontext`` values laid out as
    ``[past frames | current frame | future frames]``. Context positions
    that fall outside the recording are filled with all-zero frames.

    Args:
        audio_filenames: iterable of paths to WAV files.
        numcep: number of MFCC coefficients computed per frame.
        numcontext: number of context frames taken on each side of a frame.

    Returns:
        A tuple ``(inputs, input_seq_lens)``. ``inputs`` is ``np.asarray``
        of one float64 matrix per file; ``input_seq_lens`` is the list of
        frame counts, one per file.
    """
    inputs = []
    input_seq_lens = []

    for audio_filename in audio_filenames:
        # Load the waveform and compute its MFCC features.
        fs, audio = wav.read(audio_filename)
        orig_inputs = mfcc(audio, samplerate=fs, numcep=numcep)

        # Attach past and future context to every frame.
        train_inputs = _stack_context_frames(orig_inputs, numcep, numcontext)

        # Whiten inputs using the mean/std over the whole matrix
        # (TODO: Should we whiten?)
        train_inputs = (train_inputs - np.mean(train_inputs)) / np.std(train_inputs)

        input_seq_lens.append(train_inputs.shape[0])
        inputs.append(train_inputs)

    # NOTE(review): with several files of different lengths this is a ragged
    # list of matrices; recent NumPy versions may refuse to build an array
    # from it -- confirm before feeding more than one file.
    return (np.asarray(inputs), input_seq_lens)


def _stack_context_frames(features, numcep, numcontext):
    """Return ``features`` with ``numcontext`` neighbours stacked per frame."""
    num_frames = features.shape[0]
    window = 2 * numcontext + 1

    # Zero frames stand in for context before the first and after the last
    # frame (TODO: fill with the MFCC of silence instead of zeros).
    padded = np.zeros((num_frames + 2 * numcontext, numcep), dtype=np.float64)
    padded[numcontext:numcontext + num_frames] = features

    # np.float64 replaces the removed np.float alias; the dtype is unchanged.
    stacked = np.zeros((num_frames, numcep * window), dtype=np.float64)
    for time_slice in range(num_frames):
        # Rows time_slice .. time_slice + window - 1 of the padded matrix are
        # exactly the past context, the current frame and the future context.
        stacked[time_slice] = padded[time_slice:time_slice + window].reshape(-1)
    return stacked
|
0
util/importers/__init__.py
Normal file
0
util/importers/__init__.py
Normal file
86
util/importers/ted_lium/__init__.py
Normal file
86
util/importers/ted_lium/__init__.py
Normal file
@ -0,0 +1,86 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from os import path
|
||||||
|
from util.text import text_to_sparse_tuple
|
||||||
|
from util.audio import audiofiles_to_audio_data_sets
|
||||||
|
|
||||||
|
class DataSets(object):
    """Read-only bundle of the train, validation and test data sets."""

    def __init__(self, train, validation, test):
        # Stored privately; exposed only through the properties below.
        self._train, self._validation, self._test = train, validation, test

    @property
    def test(self):
        """The data set passed as ``test``."""
        return self._test

    @property
    def validation(self):
        """The data set passed as ``validation``."""
        return self._validation

    @property
    def train(self):
        """The data set passed as ``train``."""
        return self._train
|
||||||
|
|
||||||
|
class DataSet(object):
    """Holds the inputs, outputs and sequence lengths of one data split."""

    def __init__(self, inputs, outputs, seq_len):
        self._inputs = inputs
        self._outputs = outputs
        self._seq_len = seq_len
        # Running count of examples requested through next_batch().
        self._offset = 0

    @property
    def num_examples(self):
        """Size of the first dimension of the inputs."""
        return self._inputs.shape[0]

    @property
    def max_batch_seq_len(self):
        """Largest value found in the stored sequence lengths."""
        return np.amax(self._seq_len)

    def next_batch(self, batch_size):
        """Advance the offset by ``batch_size`` and return the whole set.

        Returns the tuple ``(inputs, outputs, seq_len)`` exactly as stored.
        """
        # TODO: Choose only batch_size elements
        self._offset = self._offset + batch_size
        return (self._inputs, self._outputs, self._seq_len)
|
||||||
|
|
||||||
|
|
||||||
|
def read_data_sets(data_dir, numcep, numcontext):
    """Load the train, validation and test splits found in ``data_dir``.

    For each split the transcript is read first, then the audio features,
    and both are wrapped in a ``DataSet``.

    Args:
        data_dir: directory holding the data files.
        numcep: forwarded to ``read_audio_data_sets``.
        numcontext: forwarded to ``read_audio_data_sets``.

    Returns:
        A ``DataSets`` instance holding the three ``DataSet`` objects.
    """
    splits = {}
    for split_name in ('train', 'validation', 'test'):
        labels = read_text_data_sets(data_dir, split_name)
        features, lengths = read_audio_data_sets(data_dir, numcep, numcontext, split_name)
        splits[split_name] = DataSet(inputs=features, outputs=labels, seq_len=lengths)

    # The dictionary keys match the DataSets constructor parameter names.
    return DataSets(**splits)
|
||||||
|
|
||||||
|
|
||||||
|
def read_text_data_sets(data_dir, data_type):
    """Read the transcript in ``data_dir`` and return it as a sparse tuple.

    Each line of ``LDC93S1.txt`` is stripped and lower-cased, its first two
    space-separated fields are dropped, and every ``.`` is removed. Only the
    last line of the file is kept.

    Args:
        data_dir: directory containing ``LDC93S1.txt``.
        data_type: currently ignored.

    Returns:
        The result of ``text_to_sparse_tuple`` for the single transcript.

    Raises:
        ValueError: if the transcript file contains no lines.
    """
    # TODO: Do not ignore data_type = ['train'|'validation'|'test']

    text_filename = path.join(data_dir, 'LDC93S1.txt')

    original = None
    # Text mode, so the str operations below work on Python 2 and Python 3.
    with open(text_filename, 'r') as f:
        for line in f:
            # Later lines overwrite earlier ones: the last line wins.
            original = ' '.join(line.strip().lower().split(' ')[2:]).replace('.', '')

    if original is None:
        raise ValueError('No transcript found in %s' % text_filename)

    return text_to_sparse_tuple([original])
|
||||||
|
|
||||||
|
def read_audio_data_sets(data_dir, numcep, numcontext, data_type):
    """Build the audio features for ``LDC93S1.wav`` located in ``data_dir``.

    Args:
        data_dir: directory containing ``LDC93S1.wav``.
        numcep: forwarded to ``audiofiles_to_audio_data_sets``.
        numcontext: forwarded to ``audiofiles_to_audio_data_sets``.
        data_type: currently ignored.

    Returns:
        Whatever ``audiofiles_to_audio_data_sets`` returns for that one file.
    """
    # TODO: Do not ignore data_type = ['train'|'validation'|'test']
    wav_paths = [path.join(data_dir, 'LDC93S1.wav')]
    return audiofiles_to_audio_data_sets(wav_paths, numcep, numcontext)
|
50
util/text/__init__.py
Normal file
50
util/text/__init__.py
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
# Label constants: the space character is label 0 and letters follow it.
SPACE_TOKEN = '<space>'  # placeholder string standing for a space
SPACE_INDEX = 0  # integer label assigned to SPACE_TOKEN
FIRST_INDEX = ord('a') - 1  # subtracting this from ord('a') gives label 1
|
||||||
|
|
||||||
|
|
||||||
|
def text_to_sparse_tuple(originals):
    """Convert sentences into a sparse tuple of integer character labels.

    Every character is mapped to ``ord(char) - FIRST_INDEX`` and every space
    between words is mapped to ``SPACE_INDEX``.

    Args:
        originals: iterable of sentence strings.

    Returns:
        The ``(indices, values, shape)`` tuple produced by
        ``sparse_tuple_from`` for the per-sentence label arrays.
    """
    results = []

    for original in originals:
        # Doubling each space makes split(' ') yield an empty string between
        # words; that empty string marks where a space label belongs.
        # Replacing a space with a single space would be a no-op and would
        # silently drop every word boundary.
        words = original.replace(' ', '  ').split(' ')

        # Tokenize words into letters, adding SPACE_TOKEN where required.
        tokens = np.hstack([SPACE_TOKEN if word == '' else list(word) for word in words])

        # Map tokens to indices.
        labels = np.asarray([SPACE_INDEX if token == SPACE_TOKEN else ord(token) - FIRST_INDEX
                             for token in tokens])

        results.append(labels)

    # Create the sparse representation used to feed the placeholder.
    return sparse_tuple_from(results)
|
||||||
|
|
||||||
|
|
||||||
|
def sparse_tuple_from(sequences, dtype=np.int32):
    """Create a sparse representation of a list of sequences.

    Args:
        sequences: a list of sequences (lists or arrays) of label values.
        dtype: NumPy dtype used for the returned values.

    Returns:
        A tuple ``(indices, values, shape)`` where ``indices`` is an int64
        array of ``(sequence number, position)`` pairs, ``values`` holds the
        matching elements in ``dtype``, and ``shape`` is the int64 pair
        ``[number of sequences, longest sequence length]``.
    """
    indices = []
    values = []

    for n, seq in enumerate(sequences):
        # range (not xrange) keeps this working on Python 2 and Python 3.
        indices.extend(zip([n] * len(seq), range(len(seq))))
        values.extend(seq)

    # reshape keeps the two-column layout even when there are no elements.
    indices = np.asarray(indices, dtype=np.int64).reshape(-1, 2)
    values = np.asarray(values, dtype=dtype)

    # Taking the width from the sequence lengths equals "largest position
    # + 1" and also works when every sequence is empty.
    max_len = max([len(seq) for seq in sequences] or [0])
    shape = np.asarray([len(sequences), max_len], dtype=np.int64)

    return indices, values, shape
|
Loading…
Reference in New Issue
Block a user