commit
8d6f188eb6
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
.ipynb_checkpoints
|
||||
*.pyc
|
6189
DeepSpeech.ipynb
6189
DeepSpeech.ipynb
File diff suppressed because it is too large
Load Diff
1
data/smoke_test/LDC93S1.txt
Normal file
1
data/smoke_test/LDC93S1.txt
Normal file
@ -0,0 +1 @@
|
||||
0 46797 She had your dark suit in greasy wash water all year.
|
BIN
data/smoke_test/LDC93S1.wav
Normal file
BIN
data/smoke_test/LDC93S1.wav
Normal file
Binary file not shown.
Binary file not shown.
Before Width: | Height: | Size: 81 KiB |
Binary file not shown.
Before Width: | Height: | Size: 40 KiB |
Binary file not shown.
Before Width: | Height: | Size: 29 KiB |
Binary file not shown.
Before Width: | Height: | Size: 167 KiB |
Binary file not shown.
Before Width: | Height: | Size: 186 KiB |
0
util/__init__.py
Normal file
0
util/__init__.py
Normal file
80
util/audio/__init__.py
Normal file
80
util/audio/__init__.py
Normal file
@ -0,0 +1,80 @@
|
||||
import numpy as np
|
||||
import scipy.io.wavfile as wav
|
||||
|
||||
from python_speech_features import mfcc
|
||||
|
||||
def audiofiles_to_audio_data_sets(audio_filenames, numcep, numcontext):
    """Turn WAV files into whitened, context-stacked feature matrices.

    For every file the features returned by ``mfcc`` are extended so that
    each time slice also carries ``numcontext`` past and ``numcontext``
    future slices (zero-filled where the recording has no such slice).
    The matrix is then normalised with its own global mean and standard
    deviation.

    Args:
        audio_filenames: iterable of paths readable by
            ``scipy.io.wavfile.read``.
        numcep: number of cepstral coefficients requested from ``mfcc``.
        numcontext: number of context slices taken on each side of a slice.

    Returns:
        A tuple ``(inputs, input_seq_lens)``. ``inputs`` is
        ``np.asarray`` of the per-file matrices, each of shape
        ``(time_slices, numcep + 2*numcep*numcontext)``;
        ``input_seq_lens`` lists the number of time slices of each file.
    """
    inputs = []
    input_seq_lens = []

    for audio_filename in audio_filenames:
        # Load wav file and compute its features; mfcc is assumed to return
        # a (time_slices, numcep) array.
        fs, audio = wav.read(audio_filename)
        orig_inputs = mfcc(audio, samplerate=fs, numcep=numcep)

        train_inputs = _stack_context(orig_inputs, numcep, numcontext)

        # Whiten inputs (TODO: Should we whiten)
        train_inputs = (train_inputs - np.mean(train_inputs)) / np.std(train_inputs)

        input_seq_lens.append(train_inputs.shape[0])
        inputs.append(train_inputs)

    return (np.asarray(inputs), input_seq_lens)


def _stack_context(features, numcep, numcontext):
    """Append numcontext past and future slices to every row of features."""
    num_slices = features.shape[0]
    window = 2 * numcontext + 1

    # Zero rows before and after the recording stand in for the missing
    # context (TODO: Fill them with the MFCC of silence).
    # np.float64 replaces the np.float alias that NumPy has removed.
    padded = np.zeros((num_slices + 2 * numcontext, numcep), dtype=np.float64)
    padded[numcontext:numcontext + num_slices] = features

    # Row t of the result is padded[t:t + window] flattened, i.e. the
    # original slices t - numcontext .. t + numcontext in time order.
    window_index = np.arange(num_slices)[:, np.newaxis] + np.arange(window)
    return padded[window_index].reshape(num_slices, window * numcep)
|
0
util/importers/__init__.py
Normal file
0
util/importers/__init__.py
Normal file
86
util/importers/ted_lium/__init__.py
Normal file
86
util/importers/ted_lium/__init__.py
Normal file
@ -0,0 +1,86 @@
|
||||
import numpy as np
|
||||
|
||||
from os import path
|
||||
from util.text import text_to_sparse_tuple
|
||||
from util.audio import audiofiles_to_audio_data_sets
|
||||
|
||||
class DataSets(object):
    """Read-only bundle holding the train, validation and test splits."""

    def __init__(self, train, validation, test):
        self._train, self._validation, self._test = train, validation, test

    # Each split is exposed through a getter-only property.
    train = property(lambda self: self._train,
                     doc="The split passed as ``train``.")
    validation = property(lambda self: self._validation,
                          doc="The split passed as ``validation``.")
    test = property(lambda self: self._test,
                    doc="The split passed as ``test``.")
|
||||
|
||||
class DataSet(object):
    """One data split: inputs, outputs and the length of every input."""

    def __init__(self, inputs, outputs, seq_len):
        self._inputs = inputs
        self._outputs = outputs
        self._seq_len = seq_len
        # Running count of examples requested through next_batch.
        self._offset = 0

    def next_batch(self, batch_size):
        """Advance the offset by batch_size and return the whole split.

        Returns:
            The tuple ``(inputs, outputs, seq_len)`` given at construction.
        """
        # TODO: Choose only batch_size elements
        self._offset += batch_size
        return (self._inputs, self._outputs, self._seq_len)

    @property
    def max_batch_seq_len(self):
        """Largest value found in the stored sequence lengths."""
        longest = np.amax(self._seq_len)
        return longest

    @property
    def num_examples(self):
        """Size of the first axis of the stored inputs."""
        example_count = self._inputs.shape[0]
        return example_count
|
||||
|
||||
|
||||
def read_data_sets(data_dir, numcep, numcontext):
    """Load the train, validation and test splits found in data_dir.

    Args:
        data_dir: directory handed to the text and audio readers.
        numcep: forwarded to ``read_audio_data_sets``.
        numcontext: forwarded to ``read_audio_data_sets``.

    Returns:
        A ``DataSets`` whose three members are ``DataSet`` instances.
    """
    splits = {}
    for data_type in ('train', 'validation', 'test'):
        # Text first, then audio, for each split in turn.
        outputs = read_text_data_sets(data_dir, data_type)
        inputs, seq_len = read_audio_data_sets(data_dir, numcep, numcontext, data_type)
        splits[data_type] = DataSet(inputs=inputs, outputs=outputs, seq_len=seq_len)

    return DataSets(**splits)
|
||||
|
||||
|
||||
def read_text_data_sets(data_dir, data_type):
    """Read the transcript file in data_dir as a sparse label tuple.

    Each line is expected to start with two fields (dropped here) followed
    by the sentence. The sentence is lower-cased and its periods removed.
    When the file holds several non-blank lines, the last one is used.

    Args:
        data_dir: directory containing ``LDC93S1.txt``.
        data_type: currently ignored.

    Returns:
        Whatever ``text_to_sparse_tuple`` returns for the one transcript.

    Raises:
        ValueError: if the file contains no non-blank line.
    """
    # TODO: Do not ignore data_type = ['train'|'validation'|'test']

    # Create file name
    text_filename = path.join(data_dir, 'LDC93S1.txt')

    original = None
    # Text mode: the str operations below fail on the bytes that 'rb'
    # yields under Python 3.
    with open(text_filename, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A trailing blank line must not wipe the transcript.
                continue
            original = ' '.join(line.lower().split(' ')[2:]).replace('.', '')

    if original is None:
        raise ValueError('No transcript found in %s' % text_filename)

    return text_to_sparse_tuple([original])
|
||||
|
||||
def read_audio_data_sets(data_dir, numcep, numcontext, data_type):
    """Build the audio features for the recording stored in data_dir.

    Args:
        data_dir: directory containing ``LDC93S1.wav``.
        numcep: forwarded to ``audiofiles_to_audio_data_sets``.
        numcontext: forwarded to ``audiofiles_to_audio_data_sets``.
        data_type: currently ignored.

    Returns:
        The result of ``audiofiles_to_audio_data_sets`` for that one file.
    """
    # TODO: Do not ignore data_type = ['train'|'validation'|'test']
    audio_filenames = [path.join(data_dir, 'LDC93S1.wav')]
    return audiofiles_to_audio_data_sets(audio_filenames, numcep, numcontext)
|
50
util/text/__init__.py
Normal file
50
util/text/__init__.py
Normal file
@ -0,0 +1,50 @@
|
||||
import numpy as np
|
||||
|
||||
# Constants used when mapping characters to label indices
SPACE_TOKEN = '<space>'  # marker standing in for a blank between words
SPACE_INDEX = 0  # label index assigned to SPACE_TOKEN
FIRST_INDEX = ord('a') - 1  # 0 is reserved to space, so 'a' maps to 1
|
||||
|
||||
|
||||
def text_to_sparse_tuple(originals):
    """Encode sentences as label indices and pack them sparsely.

    Every blank becomes ``SPACE_INDEX``; every other character ``c``
    becomes ``ord(c) - FIRST_INDEX``. The former split-based tokenizer
    never produced a space label, because its ``replace(' ', ' ')`` call
    was a no-op; the characters are therefore mapped directly here.

    Args:
        originals: iterable of sentence strings.

    Returns:
        The ``(indices, values, shape)`` tuple built by
        ``sparse_tuple_from`` from the encoded sentences.
    """
    # Define list to hold results
    results = []

    # Process each original in originals
    for original in originals:
        labels = [SPACE_INDEX if char == ' ' else ord(char) - FIRST_INDEX
                  for char in original]

        # The split-based code encoded an empty sentence as one space
        # label; keep that so the sparse shape is never degenerate.
        if not labels:
            labels = [SPACE_INDEX]

        results.append(np.asarray(labels))

    # Creating sparse representation to feed the placeholder
    return sparse_tuple_from(results)
|
||||
|
||||
|
||||
def sparse_tuple_from(sequences, dtype=np.int32):
    """Create a sparse representation of sequences.

    Args:
        sequences: a list of lists of type dtype where each element is a
            sequence
        dtype: NumPy dtype of the returned values array.

    Returns:
        A tuple with (indices, values, shape):
        indices is an int64 array of shape (total_items, 2) holding
        (sequence number, position) pairs, values holds the items in the
        same order, and shape is [number of sequences, longest length].
    """
    indices = []
    values = []
    max_len = 0

    for n, seq in enumerate(sequences):
        seq_len = len(seq)
        # range instead of the Python 2 only xrange.
        indices.extend((n, position) for position in range(seq_len))
        values.extend(seq)
        max_len = max(max_len, seq_len)

    # reshape keeps the (N, 2) layout even when there is nothing to index.
    indices = np.asarray(indices, dtype=np.int64).reshape(-1, 2)
    values = np.asarray(values, dtype=dtype)
    # Tracking the longest length directly avoids calling max() on an
    # empty indices array, which raised for empty input.
    shape = np.asarray([len(sequences), max_len], dtype=np.int64)

    return indices, values, shape
|
Loading…
Reference in New Issue
Block a user