Adding CTC to notebook

2016-09-18 20:18:35 +02:00 · 2016-09-18 20:18:35 +02:00 · 9eebe98aa9
commit 9eebe98aa9
parent 311da0e80c
14 changed files with 5451 additions and 957 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,2 @@
 .ipynb_checkpoints
 *.pyc
--- a/DeepSpeech.ipynb
+++ b/DeepSpeech.ipynb
--- a/data/smoke_test/LDC93S1.txt
+++ b/data/smoke_test/LDC93S1.txt
@ -0,0 +1 @@
 0 46797 She had your dark suit in greasy wash water all year.
--- a/data/smoke_test/LDC93S1.wav
+++ b/data/smoke_test/LDC93S1.wav
--- a/images/Figure.png
+++ b/images/Figure.png
--- a/images/Lattice.png
+++ b/images/Lattice.png
--- a/images/PrefixDecoding.png
+++ b/images/PrefixDecoding.png
--- a/images/PrefixSearchDecoding.png
+++ b/images/PrefixSearchDecoding.png
--- a/images/TokenPassing.png
+++ b/images/TokenPassing.png
--- a/util/init.py
+++ b/util/init.py
--- a/util/audio/init.py
+++ b/util/audio/init.py
@ -0,0 +1,80 @@
 import numpy as np
 import scipy.io.wavfile as wav
 from python_speech_features import mfcc
 def audiofiles_to_audio_data_sets(audio_filenames, numcep, numcontext):
    """Load WAV files and convert each to whitened, context-stacked MFCC features.
    Every time slice is expanded from numcep values to
    numcep + 2*numcep*numcontext values:
      - numcontext*numcep values for the preceding time slices,
      - numcep values for the current time slice,
      - numcontext*numcep values for the following time slices.
    Context that falls outside the recording is filled with zeros.
    Args:
        audio_filenames: iterable of paths to WAV files.
        numcep: number of cepstral coefficients requested from mfcc().
        numcontext: number of past (and future) time slices stacked around
            each time slice.
    Returns:
        A tuple (inputs, input_seq_lens). inputs is a NumPy array with one
        feature matrix per file (a regular 3-D array when every file has the
        same number of time slices, otherwise a 1-D object array of 2-D
        matrices). input_seq_lens is a list with the number of time slices
        of each file.
    """
    inputs = []
    input_seq_lens = []
    for audio_filename in audio_filenames:
        # Load wav file
        fs, audio = wav.read(audio_filename)
        # Get mfcc coefficients; the code below relies on one row per time slice
        orig_inputs = mfcc(audio, samplerate=fs, numcep=numcep)
        # Surround every time slice with its past and future context
        train_inputs = _stack_context(orig_inputs, numcep, numcontext)
        # Whiten inputs over the whole matrix (TODO: Should we whiten)
        train_inputs = (train_inputs - np.mean(train_inputs))/np.std(train_inputs)
        # Record the sequence length and the features of this file
        input_seq_lens.append(train_inputs.shape[0])
        inputs.append(train_inputs)
    return (_as_batch_array(inputs), input_seq_lens)

 def _stack_context(features, numcep, numcontext):
    """Return features with numcontext past and future rows appended to each row."""
    features = np.asarray(features, dtype=np.float64)
    # Zero rows stand in for context before the start and after the end
    # (TODO: Fill the padding with the MFCC of silence)
    padding = np.zeros((numcontext, numcep), dtype=np.float64)
    padded = np.concatenate((padding, features, padding))
    # After padding, row t of the output is the flattened window
    # padded[t : t + 2*numcontext + 1], i.e. past, now, future in order
    window = 2*numcontext + 1
    stacked = np.empty((features.shape[0], window*numcep), dtype=np.float64)
    for time_slice in range(features.shape[0]):
        stacked[time_slice] = padded[time_slice:time_slice + window].reshape(-1)
    return stacked

 def _as_batch_array(arrays):
    """Pack per-file matrices into one array, tolerating differing lengths."""
    if len(set(a.shape for a in arrays)) <= 1:
        return np.asarray(arrays)
    # Ragged input: build the object array explicitly, since recent NumPy
    # refuses to create one implicitly from inhomogeneous shapes
    batch = np.empty(len(arrays), dtype=object)
    for i, a in enumerate(arrays):
        batch[i] = a
    return batch
--- a/util/importers/init.py
+++ b/util/importers/init.py
--- a/util/importers/ted_lium/init.py
+++ b/util/importers/ted_lium/init.py
@ -0,0 +1,86 @@
 import numpy as np
 from os import path
 from util.text import text_to_sparse_tuple 
 from util.audio import audiofiles_to_audio_data_sets
 class DataSets(object):
    """Read-only bundle of the train, validation and test data sets."""
    def __init__(self, train, validation, test):
        self._train, self._validation, self._test = train, validation, test
    # Expose each split as a read-only attribute
    train = property(lambda self: self._train, doc="The training data set.")
    validation = property(lambda self: self._validation, doc="The validation data set.")
    test = property(lambda self: self._test, doc="The test data set.")
 class DataSet(object):
    """Holds the inputs, outputs and sequence lengths of one data split."""
    def __init__(self, inputs, outputs, seq_len):
        self._inputs, self._outputs, self._seq_len = inputs, outputs, seq_len
        # Running count of examples requested so far through next_batch()
        self._offset = 0
    def next_batch(self, batch_size):
        """Advance the offset by batch_size and return (inputs, outputs, seq_len).
        The whole data set is returned regardless of batch_size.
        """
        # TODO: Choose only batch_size elements
        whole_set = (self._inputs, self._outputs, self._seq_len)
        self._offset = self._offset + batch_size
        return whole_set
    @property
    def max_batch_seq_len(self):
        """Largest value found in the stored sequence lengths."""
        longest = np.max(self._seq_len)
        return longest
    @property
    def num_examples(self):
        """Size of the first axis of the stored inputs."""
        input_shape = self._inputs.shape
        return input_shape[0]
 def read_data_sets(data_dir, numcep, numcontext):
    """Build the train, validation and test DataSet objects found under data_dir.
    Args:
        data_dir: directory holding the transcript and audio files.
        numcep: number of cepstral coefficients, forwarded to the audio reader.
        numcontext: number of context time slices, forwarded to the audio reader.
    Returns:
        A DataSets instance wrapping the three splits.
    """
    splits = {}
    for split_name in ('train', 'validation', 'test'):
        # Transcripts first, then audio features, for each split in turn
        targets = read_text_data_sets(data_dir, split_name)
        features, lengths = read_audio_data_sets(data_dir, numcep, numcontext, split_name)
        splits[split_name] = DataSet(inputs=features, outputs=targets, seq_len=lengths)
    return DataSets(train=splits['train'],
                    validation=splits['validation'],
                    test=splits['test'])
 def read_text_data_sets(data_dir, data_type):
    """Read the transcript file in data_dir and return it as a sparse tuple.
    Each non-blank line is lower-cased, its first two space-separated fields
    are dropped, and every '.' is removed. When the file holds several
    non-blank lines only the last one is kept.
    Args:
        data_dir: directory containing 'LDC93S1.txt'.
        data_type: currently ignored.
    Returns:
        The result of text_to_sparse_tuple() for the single transcript.
    Raises:
        ValueError: if the file contains no non-blank line.
    """
    # TODO: Do not ignore data_type = ['train'|'validation'|'test']
    # Create file name
    text_filename = path.join(data_dir, 'LDC93S1.txt')
    original = None
    # Open in text mode so lines are str (not bytes) on Python 3 as well
    with open(text_filename, 'r') as f:
        for line in f:
            line = line.strip()
            # Skip blank lines so a trailing newline cannot blank the transcript
            if not line:
                continue
            original = ' '.join(line.lower().split(' ')[2:]).replace('.', '')
    if original is None:
        raise ValueError('No transcript found in %s' % text_filename)
    return text_to_sparse_tuple([original])
 def read_audio_data_sets(data_dir, numcep, numcontext, data_type):
    """Compute the audio features of the 'LDC93S1.wav' file found in data_dir.
    data_type is currently ignored. Returns whatever
    audiofiles_to_audio_data_sets() returns for that single file.
    """
    # TODO: Do not ignore data_type = ['train'|'validation'|'test']
    # Single-element list holding the path of the audio file
    wav_files = [path.join(data_dir, 'LDC93S1.wav')]
    # Return properly formatted data
    return audiofiles_to_audio_data_sets(wav_files, numcep, numcontext)
--- a/util/text/init.py
+++ b/util/text/init.py
@ -0,0 +1,50 @@
 import numpy as np
 # Constants used to map characters to integer labels
 # Placeholder string standing for a space between words during tokenization
 SPACE_TOKEN = '<space>'
 # Integer label assigned to the space token
 SPACE_INDEX = 0
 # Offset subtracted from ord(character): 'a' maps to 1, since 0 is reserved for space
 FIRST_INDEX = ord('a') - 1  # 0 is reserved to space
 def text_to_sparse_tuple(originals):
    """Encode sentences as integer label sequences and pack them sparsely.
    Every character is mapped to ord(character) - FIRST_INDEX, and the gap
    between words is mapped to SPACE_INDEX.
    Args:
        originals: iterable of sentence strings.
    Returns:
        The (indices, values, shape) tuple produced by sparse_tuple_from().
    """
    encoded = []
    for sentence in originals:
        # Doubling each space makes split(' ') emit an empty string per gap
        pieces = []
        for word in sentence.replace(' ', '  ').split(' '):
            if word == '':
                pieces.append(SPACE_TOKEN)
            else:
                pieces.append(list(word))
        # Flatten the words and space tokens into one sequence of symbols
        symbols = np.hstack(pieces)
        # Map each symbol to its integer label
        labels = []
        for symbol in symbols:
            if symbol == SPACE_TOKEN:
                labels.append(SPACE_INDEX)
            else:
                labels.append(ord(symbol) - FIRST_INDEX)
        encoded.append(np.asarray(labels))
    # Creating sparse representation to feed the placeholder
    return sparse_tuple_from(encoded)
 def sparse_tuple_from(sequences, dtype=np.int32):
    """Create a sparse representation of sequences.
    Args:
        sequences: a list of lists of type dtype where each element is a sequence
        dtype: NumPy dtype of the returned values array
    Returns:
        A tuple with (indices, values, shape) where
          - indices is an int64 array of shape (N, 2) holding the
            (sequence number, position) of every element,
          - values is an array of dtype holding the N elements in that order,
          - shape is the int64 array [number of sequences, longest sequence length].
    """
    indices = []
    values = []
    for n, seq in enumerate(sequences):
        # range (not xrange) keeps this working on both Python 2 and 3
        indices.extend((n, position) for position in range(len(seq)))
        values.extend(seq)
    # reshape keeps the (N, 2) layout even when there are no elements at all
    indices = np.asarray(indices, dtype=np.int64).reshape(-1, 2)
    values = np.asarray(values, dtype=dtype)
    # Longest sequence length; 0 when there are no sequences or all are empty
    max_len = max([len(seq) for seq in sequences] or [0])
    shape = np.asarray([len(sequences), max_len], dtype=np.int64)
    return indices, values, shape
		`@ -0,0 +1 @@`
							`0 46797 She had your dark suit in greasy wash water all year.`