Adding CTC to notebook

Kelly Davis 2016-09-18 20:18:35 +02:00
parent 311da0e80c
commit 9eebe98aa9
14 changed files with 5451 additions and 957 deletions

2
.gitignore vendored Normal file

@@ -0,0 +1,2 @@
.ipynb_checkpoints
*.pyc

File diff suppressed because it is too large

1
data/smoke_test/LDC93S1.txt Normal file

@@ -0,0 +1 @@
0 46797 She had your dark suit in greasy wash water all year.

BIN
data/smoke_test/LDC93S1.wav Normal file

Binary file not shown.

Five binary image files deleted (file names and previews not shown): 81 KiB, 40 KiB, 29 KiB, 167 KiB and 186 KiB.

0
util/__init__.py Normal file

80
util/audio/__init__.py Normal file

@@ -0,0 +1,80 @@
import numpy as np
import scipy.io.wavfile as wav
from python_speech_features import mfcc
def audiofiles_to_audio_data_sets(audio_filenames, numcep, numcontext):
# Define audio_data_sets to return
inputs = []
input_seq_lens = []
# Loop over audio_filenames
for audio_filename in audio_filenames:
# Load wav files
fs, audio = wav.read(audio_filename)
# Get mfcc coefficients
orig_inputs = mfcc(audio, samplerate=fs, numcep=numcep)
# For each time slice of the training set we need to copy the context. This
# turns the numcep-dimensional vector into a vector with
# numcep + 2*numcep*numcontext dimensions, because we keep:
# - numcep dimensions for the current MFCC feature set
# - numcontext*numcep dimensions for each of the past and future (x2) MFCC feature sets
# => numcep + 2*numcontext*numcep dimensions in total
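# Illustrative example (values assumed here, not taken from the notebook):
# with numcep = 26 and numcontext = 9, each time slice becomes a
# 26 + 2*9*26 = 494 dimensional feature vector.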
train_inputs = np.array([], np.float)
train_inputs.resize((orig_inputs.shape[0], numcep + 2*numcep*numcontext))
# Prepare the prefix and postfix context (TODO: fill empty_mfcc with the MFCC of silence)
empty_mfcc = np.array([])
empty_mfcc.resize((numcep))
# Prepare train_inputs with past and future contexts
time_slices = range(train_inputs.shape[0])
context_past_min = time_slices[0] + numcontext
context_future_max = time_slices[-1] - numcontext
for time_slice in time_slices:
### Reminder: array[start:stop:step]
### slices from index |start| up to |stop| (excluded), every |step|
# Pick up to numcontext time slices in the past, and complete with empty
# mfcc features
need_empty_past = max(0, (context_past_min - time_slice))
empty_source_past = list(empty_mfcc for empty_slots in range(need_empty_past))
data_source_past = orig_inputs[max(0, time_slice - numcontext):time_slice]
assert(len(empty_source_past) + len(data_source_past) == numcontext)
# Pick up to numcontext time slices in the future, and complete with empty
# mfcc features
need_empty_future = max(0, (time_slice - context_future_max))
empty_source_future = list(empty_mfcc for empty_slots in range(need_empty_future))
data_source_future = orig_inputs[time_slice + 1:time_slice + numcontext + 1]
assert(len(empty_source_future) + len(data_source_future) == numcontext)
if need_empty_past:
past = np.concatenate((empty_source_past, data_source_past))
else:
past = data_source_past
if need_empty_future:
future = np.concatenate((data_source_future, empty_source_future))
else:
future = data_source_future
past = np.reshape(past, numcontext*numcep)
now = orig_inputs[time_slice]
future = np.reshape(future, numcontext*numcep)
train_inputs[time_slice] = np.concatenate((past, now, future))
assert(len(train_inputs[time_slice]) == numcep + 2*numcep*numcontext)
# Whiten inputs (TODO: should we whiten?)
train_inputs = (train_inputs - np.mean(train_inputs))/np.std(train_inputs)
# Obtain array of sequence lengths
input_seq_lens.append(train_inputs.shape[0])
# Convert train_inputs to proper form
inputs.append(train_inputs)
# Return results
return (np.asarray(inputs), input_seq_lens)
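
A minimal usage sketch for the helper above. The WAV path is the smoke-test file added in this commit; the numcep and numcontext values are illustrative only, not taken from the notebook:

    from util.audio import audiofiles_to_audio_data_sets

    inputs, seq_lens = audiofiles_to_audio_data_sets(['data/smoke_test/LDC93S1.wav'],
                                                     numcep=26, numcontext=9)
    print(inputs.shape)  # (1, time_slices, 26 + 2*26*9)
    print(seq_lens)      # [time_slices]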


@@ -0,0 +1,86 @@
import numpy as np
from os import path
from util.text import text_to_sparse_tuple
from util.audio import audiofiles_to_audio_data_sets
class DataSets(object):
def __init__(self, train, validation, test):
self._train = train
self._validation = validation
self._test = test
@property
def train(self):
return self._train
@property
def validation(self):
return self._validation
@property
def test(self):
return self._test
class DataSet(object):
def __init__(self, inputs, outputs, seq_len):
self._offset = 0
self._inputs = inputs
self._outputs = outputs
self._seq_len = seq_len
def next_batch(self, batch_size):
next_batch = (self._inputs, self._outputs, self._seq_len) # TODO: Choose only batch_size elements
self._offset += batch_size
return next_batch
@property
def max_batch_seq_len(self):
return np.amax(self._seq_len)
@property
def num_examples(self):
return self._inputs.shape[0]
def read_data_sets(data_dir, numcep, numcontext):
# Get train data
train_outputs = read_text_data_sets(data_dir, 'train')
train_inputs, train_seq_len = read_audio_data_sets(data_dir, numcep, numcontext, 'train')
# Get validation data
validation_outputs = read_text_data_sets(data_dir, 'validation')
validation_inputs, validation_seq_len = read_audio_data_sets(data_dir, numcep, numcontext, 'validation')
# Get test data
test_outputs = read_text_data_sets(data_dir, 'test')
test_inputs, test_seq_len = read_audio_data_sets(data_dir, numcep, numcontext, 'test')
# Create train, validation, and test DataSet instances
train = DataSet(inputs=train_inputs, outputs=train_outputs, seq_len=train_seq_len)
validation = DataSet(inputs=validation_inputs, outputs=validation_outputs, seq_len=validation_seq_len)
test = DataSet(inputs=test_inputs, outputs=test_outputs, seq_len=test_seq_len)
# Return DataSets
return DataSets(train=train, validation=validation, test=test)
def read_text_data_sets(data_dir, data_type):
# TODO: Do not ignore data_type = ['train'|'validation'|'test']
# Create file names
text_filename = path.join(data_dir, 'LDC93S1.txt')
# Read the transcript, dropping the two leading sample indices and any periods
with open(text_filename, 'rb') as f:
for line in f.readlines():
original = ' '.join(line.strip().lower().split(' ')[2:]).replace('.', '')
return text_to_sparse_tuple([original])
def read_audio_data_sets(data_dir, numcep, numcontext, data_type):
# TODO: Do not ignore data_type = ['train'|'validation'|'test']
# Create file name
audio_filename = path.join(data_dir, 'LDC93S1.wav')
# Return properly formatted data
return audiofiles_to_audio_data_sets([audio_filename], numcep, numcontext)
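
A sketch of how the importer above might be exercised, assuming read_data_sets has been imported into scope (the module path for this new file is not shown in the diff). The data directory matches the smoke-test files added in this commit; numcep and numcontext are illustrative values:

    data_sets = read_data_sets('data/smoke_test', numcep=26, numcontext=9)
    train = data_sets.train
    batch_inputs, batch_outputs, batch_seq_len = train.next_batch(batch_size=1)
    print(train.num_examples)        # 1 audio file in the smoke test
    print(train.max_batch_seq_len)   # number of MFCC time slices in LDC93S1.wav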

50
util/text/__init__.py Normal file

@@ -0,0 +1,50 @@
import numpy as np
# Constants
SPACE_TOKEN = '<space>'
SPACE_INDEX = 0
FIRST_INDEX = ord('a') - 1  # 0 is reserved for space
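# With this mapping: ' ' -> SPACE_INDEX (0), 'a' -> 1, 'b' -> 2, ..., 'z' -> 26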
def text_to_sparse_tuple(originals):
# Define list to hold results
results = []
# Process each original in originals
for original in originals:
# Double each space so that splitting on ' ' yields an empty string between words
result = original.replace(' ', '  ')
result = result.split(' ')
# Tokenize words into letters adding in SPACE_TOKEN where required
result = np.hstack([SPACE_TOKEN if xt == '' else list(xt) for xt in result])
# Map characters to indices
result = np.asarray([SPACE_INDEX if xt == SPACE_TOKEN else ord(xt) - FIRST_INDEX for xt in result])
# Add result to results
results.append(result)
# Creating sparse representation to feed the placeholder
return sparse_tuple_from(results)
def sparse_tuple_from(sequences, dtype=np.int32):
"""Create a sparse representention of x.
Args:
sequences: a list of lists of type dtype where each element is a sequence
Returns:
A tuple with (indices, values, shape)
"""
indices = []
values = []
for n, seq in enumerate(sequences):
indices.extend(zip([n]*len(seq), xrange(len(seq))))
values.extend(seq)
indices = np.asarray(indices, dtype=np.int64)
values = np.asarray(values, dtype=dtype)
shape = np.asarray([len(sequences), np.asarray(indices).max(0)[1]+1], dtype=np.int64)
return indices, values, shape
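
For reference, a worked example of what these helpers produce, traced by hand from the code above:

    indices, values, shape = text_to_sparse_tuple(['she had'])
    # indices -> [[0 0] [0 1] [0 2] [0 3] [0 4] [0 5] [0 6]]  (sequence index, character position)
    # values  -> [19  8  5  0  8  1  4]                       ('s' 'h' 'e' <space> 'h' 'a' 'd')
    # shape   -> [1 7]                                        (1 sequence, longest length 7)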