Revert to a pipelined approach for test epochs to avoid CPU OOM with large alphabets

Reuben Morais 2019-05-11 08:15:28 -03:00
parent a4b35d2f24
commit 699e4ebcd7
4 changed files with 49 additions and 69 deletions
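Why buffering logits blows up on large alphabets (illustrative numbers, not from the commit): the acoustic model emits one float32 logit per timestep per label, so a single 10-second utterance at roughly 50 timesteps per second against an 8,000-character alphabet already holds about 500 × 8,000 × 4 bytes ≈ 16 MB of logits. The old two-pass test loop kept these arrays for every utterance in the test set before decoding, so a few thousand utterances could exhaust host RAM, while the same buffering is harmless for a ~29-label English alphabet. The change below decodes each batch as it is produced and discards its logits immediately.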

evaluate.py View File

@@ -19,7 +19,6 @@ from util.evaluate_tools import calculate_report
 from util.feeding import create_dataset
 from util.flags import create_flags, FLAGS
 from util.logging import log_error, log_progress, create_progressbar
-from util.text import levenshtein

 def sparse_tensor_value_to_texts(value, alphabet):
@@ -88,14 +87,13 @@ def evaluate(test_csvs, create_model, try_loading):
         exit(1)

     def run_test(init_op, dataset):
-        logitses = []
         losses = []
-        seq_lengths = []
+        predictions = []
         ground_truths = []

-        bar = create_progressbar(prefix='Computing acoustic model predictions | ',
+        bar = create_progressbar(prefix='Test epoch | ',
                                  widgets=['Steps: ', progressbar.Counter(), ' | ', progressbar.Timer()]).start()
-        log_progress('Computing acoustic model predictions...')
+        log_progress('Test epoch...')

         step_count = 0
@@ -105,35 +103,23 @@ def evaluate(test_csvs, create_model, try_loading):
-        # First pass, compute losses and transposed logits for decoding
         while True:
             try:
-                logits, loss_, lengths, transcripts = session.run([transposed, loss, batch_x_len, batch_y])
+                batch_logits, batch_loss, batch_lengths, batch_transcripts = \
+                    session.run([transposed, loss, batch_x_len, batch_y])
             except tf.errors.OutOfRangeError:
                 break

+            decoded = ctc_beam_search_decoder_batch(batch_logits, batch_lengths, Config.alphabet, FLAGS.beam_width,
+                                                    num_processes=num_processes, scorer=scorer)
+            predictions.extend(d[0][1] for d in decoded)
+            ground_truths.extend(sparse_tensor_value_to_texts(batch_transcripts, Config.alphabet))
+            losses.extend(batch_loss)
+
             step_count += 1
             bar.update(step_count)

-            logitses.append(logits)
-            losses.extend(loss_)
-            seq_lengths.append(lengths)
-            ground_truths.extend(sparse_tensor_value_to_texts(transcripts, Config.alphabet))
-
         bar.finish()

-        predictions = []
-
-        bar = create_progressbar(max_value=step_count,
-                                 prefix='Decoding predictions | ').start()
-        log_progress('Decoding predictions...')
-
-        # Second pass, decode logits and compute WER and edit distance metrics
-        for logits, seq_length in bar(zip(logitses, seq_lengths)):
-            decoded = ctc_beam_search_decoder_batch(logits, seq_length, Config.alphabet, FLAGS.beam_width,
-                                                    num_processes=num_processes, scorer=scorer)
-            predictions.extend(d[0][1] for d in decoded)
-
-        distances = [levenshtein(a, b) for a, b in zip(ground_truths, predictions)]
-
-        wer, cer, samples = calculate_report(ground_truths, predictions, distances, losses)
+        wer, cer, samples = calculate_report(ground_truths, predictions, losses)

         mean_loss = np.mean(losses)

         # Take only the first report_count items
@@ -144,7 +130,7 @@ def evaluate(test_csvs, create_model, try_loading):
     print('-' * 80)
     for sample in report_samples:
         print('WER: %f, CER: %f, loss: %f' %
-              (sample.wer, sample.distance, sample.loss))
+              (sample.wer, sample.cer, sample.loss))
         print(' - src: "%s"' % sample.src)
         print(' - res: "%s"' % sample.res)
         print('-' * 80)
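The structural change in run_test() above is easiest to see stripped of the TensorFlow and decoder plumbing. The sketch below is a minimal, self-contained illustration of the single-pass pattern the new code follows; fake_batches() and fake_decode() are stand-ins for session.run() on the test iterator and for ctc_beam_search_decoder_batch(), not real DeepSpeech APIs.

    import numpy as np

    def fake_batches(num_batches=4, batch_size=2, timesteps=100, num_labels=30):
        # Stand-in for repeatedly calling session.run() on the test iterator:
        # yields (logits, transcripts) until the dataset is exhausted.
        rng = np.random.default_rng(0)
        for i in range(num_batches):
            logits = rng.random((timesteps, batch_size, num_labels), dtype=np.float32)
            transcripts = ['reference %d.%d' % (i, j) for j in range(batch_size)]
            yield logits, transcripts

    def fake_decode(logits):
        # Stand-in for the CTC beam search decoder: one hypothesis per batch item.
        return ['hypothesis'] * logits.shape[1]

    predictions = []
    ground_truths = []
    for batch_logits, batch_transcripts in fake_batches():
        # Decode inside the loop: only the text results survive the iteration,
        # and the (timesteps, batch, labels) logits array becomes garbage right
        # away instead of accumulating in a list for a second pass.
        predictions.extend(fake_decode(batch_logits))
        ground_truths.extend(batch_transcripts)

    assert len(predictions) == len(ground_truths)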

evaluate_tflite.py View File

@@ -13,7 +13,6 @@ from six.moves import zip, range
 from multiprocessing import JoinableQueue, Pool, Process, Queue, cpu_count

 from deepspeech import Model
-from util.text import levenshtein
 from util.evaluate_tools import process_decode_result, calculate_report

 r'''
@@ -96,9 +95,7 @@ def main():
         ground_truths.append(msg['ground_truth'])
         predictions.append(msg['prediction'])

-    distances = [levenshtein(a, b) for a, b in zip(ground_truths, predictions)]
-
-    wer, cer, samples = calculate_report(ground_truths, predictions, distances, losses)
+    wer, cer, samples = calculate_report(ground_truths, predictions, losses)
     mean_loss = np.mean(losses)

     print('Test - WER: %f, CER: %f, loss: %f' %
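The call-site change here mirrors the one in the first file: callers no longer precompute edit distances. A minimal usage example of the new three-argument signature (made-up strings and loss values):

    from util.evaluate_tools import calculate_report

    # Distances are now computed internally from the two text lists.
    wer, cer, samples = calculate_report(
        ['hello world', 'the cat sat'],  # ground truths
        ['hello word', 'the cat sat'],   # predictions
        [1.7, 0.4])                      # per-sample CTC losses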

util/evaluate_tools.py View File

@@ -6,7 +6,8 @@ from multiprocessing.dummy import Pool
 from attrdict import AttrDict

-from util.text import wer_cer_batch, levenshtein
+from util.text import levenshtein
+

 def pmap(fun, iterable):
     pool = Pool()
@@ -14,29 +15,53 @@ def pmap(fun, iterable):
     pool.close()
     return results

+
+def wer_cer_batch(samples):
+    r"""
+    The WER is defined as the edit/Levenshtein distance on word level divided by
+    the amount of words in the original text.
+    In case of the original having more words (N) than the result and both
+    being totally different (all N words resulting in 1 edit operation each),
+    the WER will always be 1 (N / N = 1).
+    """
+    wer = sum(s.word_distance for s in samples) / sum(s.word_length for s in samples)
+    cer = sum(s.char_distance for s in samples) / sum(s.char_length for s in samples)
+
+    wer = min(wer, 1.0)
+    cer = min(cer, 1.0)
+
+    return wer, cer
+

 def process_decode_result(item):
-    label, decoding, distance, loss = item
-    word_distance = levenshtein(label.split(), decoding.split())
-    word_length = float(len(label.split()))
+    ground_truth, prediction, loss = item
+    char_distance = levenshtein(ground_truth, prediction)
+    char_length = len(ground_truth)
+    word_distance = levenshtein(ground_truth.split(), prediction.split())
+    word_length = len(ground_truth.split())
     return AttrDict({
-        'src': label,
-        'res': decoding,
+        'src': ground_truth,
+        'res': prediction,
         'loss': loss,
-        'distance': distance,
+        'char_distance': char_distance,
+        'char_length': char_length,
+        'word_distance': word_distance,
+        'word_length': word_length,
+        'cer': char_distance / char_length,
+        'wer': word_distance / word_length,
     })


-def calculate_report(labels, decodings, distances, losses):
+def calculate_report(labels, decodings, losses):
     r'''
     This routine will calculate a WER report.
     It'll compute the `mean` WER and create ``Sample`` objects of the ``report_count`` top lowest
     loss items from the provided WER results tuple (only items with WER!=0 and ordered by their WER).
     '''
-    samples = pmap(process_decode_result, zip(labels, decodings, distances, losses))
+    samples = pmap(process_decode_result, zip(labels, decodings, losses))

     # Getting the WER and CER from the accumulated edit distances and lengths
-    samples_wer, samples_cer = wer_cer_batch(labels, decodings)
+    samples_wer, samples_cer = wer_cer_batch(samples)

     # Order the remaining items by their loss (lowest loss on top)
     samples.sort(key=lambda s: s.loss)
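One behavioral detail of the new wer_cer_batch() worth noting: it pools edit distances and reference lengths across the whole test set before dividing, so long references weigh more than they would under a plain average of per-sample WERs. A self-contained illustration with made-up counts (plain dicts standing in for the AttrDict records built by process_decode_result):

    samples = [
        {'word_distance': 1, 'word_length': 1},  # 1-word reference, 1 error
        {'word_distance': 0, 'word_length': 9},  # 9-word reference, 0 errors
    ]

    # Pooled, as wer_cer_batch() computes it: total errors / total words.
    pooled_wer = (sum(s['word_distance'] for s in samples)
                  / sum(s['word_length'] for s in samples))

    # Naive per-sample average, for contrast.
    mean_wer = sum(s['word_distance'] / s['word_length'] for s in samples) / len(samples)

    print(pooled_wer)  # 0.1
    print(mean_wer)    # 0.5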

util/text.py View File

@@ -55,34 +55,6 @@ def text_to_char_array(original, alphabet):
     return np.asarray([alphabet.label_from_string(c) for c in original])

-
-def wer_cer_batch(originals, results):
-    r"""
-    The WER is defined as the editing/Levenshtein distance on word level
-    divided by the amount of words in the original text.
-    In case of the original having more words (N) than the result and both
-    being totally different (all N words resulting in 1 edit operation each),
-    the WER will always be 1 (N / N = 1).
-    """
-    # The WER is calculated on word (and NOT on character) level.
-    # Therefore we split the strings into words first
-    assert len(originals) == len(results)
-
-    total_cer = 0.0
-    total_char_length = 0.0
-
-    total_wer = 0.0
-    total_word_length = 0.0
-
-    for original, result in zip(originals, results):
-        total_cer += levenshtein(original, result)
-        total_char_length += len(original)
-
-        total_wer += levenshtein(original.split(), result.split())
-        total_word_length += len(original.split())
-
-    return total_wer / total_word_length, total_cer / total_char_length

 # The following code is from: http://hetland.org/coding/python/levenshtein.py
 # This is a straightforward implementation of a well-known algorithm, and thus
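The context above refers to the module's levenshtein() helper, which the removed wer_cer_batch() used and which process_decode_result() still relies on. For reference, a sketch of the standard dynamic-programming formulation it implements (not necessarily the exact code from the file, which is taken from hetland.org):

    def levenshtein(a, b):
        # Classic O(len(a) * len(b)) edit distance with a rolling row.
        # Works on strings (character level) or lists of words (word level).
        if len(a) < len(b):
            a, b = b, a
        previous = list(range(len(b) + 1))
        for i, item_a in enumerate(a, start=1):
            current = [i]
            for j, item_b in enumerate(b, start=1):
                insert = current[j - 1] + 1
                delete = previous[j] + 1
                substitute = previous[j - 1] + (item_a != item_b)
                current.append(min(insert, delete, substitute))
            previous = current
        return previous[-1]

    print(levenshtein('kitten', 'sitting'))             # 3
    print(levenshtein('a b c'.split(), 'a c'.split()))  # 1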