From 699e4ebcd7b988d76428057db2775a461cb8b962 Mon Sep 17 00:00:00 2001
From: Reuben Morais
Date: Sat, 11 May 2019 08:15:28 -0300
Subject: [PATCH] Revert to a pipelined approach for test epochs to avoid CPU
 OOM with large alphabets

---
 evaluate.py            | 40 ++++++++++++-------------------
 evaluate_tflite.py     |  5 +----
 util/evaluate_tools.py | 45 ++++++++++++++++++++++++++++++++----------
 util/text.py           | 28 --------------------------
 4 files changed, 49 insertions(+), 69 deletions(-)

diff --git a/evaluate.py b/evaluate.py
index 2dc767f8..6e7033fe 100755
--- a/evaluate.py
+++ b/evaluate.py
@@ -19,7 +19,6 @@ from util.evaluate_tools import calculate_report
 from util.feeding import create_dataset
 from util.flags import create_flags, FLAGS
 from util.logging import log_error, log_progress, create_progressbar
-from util.text import levenshtein
 
 
 def sparse_tensor_value_to_texts(value, alphabet):
@@ -88,14 +87,13 @@ def evaluate(test_csvs, create_model, try_loading):
         exit(1)
 
     def run_test(init_op, dataset):
-        logitses = []
         losses = []
-        seq_lengths = []
+        predictions = []
         ground_truths = []
 
-        bar = create_progressbar(prefix='Computing acoustic model predictions | ',
+        bar = create_progressbar(prefix='Test epoch | ',
                                  widgets=['Steps: ', progressbar.Counter(),
                                           ' | ', progressbar.Timer()]).start()
-        log_progress('Computing acoustic model predictions...')
+        log_progress('Test epoch...')
 
         step_count = 0
 
@@ -105,35 +103,23 @@ def evaluate(test_csvs, create_model, try_loading):
         # First pass, compute losses and transposed logits for decoding
         while True:
             try:
-                logits, loss_, lengths, transcripts = session.run([transposed, loss, batch_x_len, batch_y])
+                batch_logits, batch_loss, batch_lengths, batch_transcripts = \
+                    session.run([transposed, loss, batch_x_len, batch_y])
             except tf.errors.OutOfRangeError:
                 break
 
+            decoded = ctc_beam_search_decoder_batch(batch_logits, batch_lengths, Config.alphabet, FLAGS.beam_width,
+                                                    num_processes=num_processes, scorer=scorer)
+            predictions.extend(d[0][1] for d in decoded)
+            ground_truths.extend(sparse_tensor_value_to_texts(batch_transcripts, Config.alphabet))
+            losses.extend(batch_loss)
+
             step_count += 1
             bar.update(step_count)
 
-            logitses.append(logits)
-            losses.extend(loss_)
-            seq_lengths.append(lengths)
-            ground_truths.extend(sparse_tensor_value_to_texts(transcripts, Config.alphabet))
-
         bar.finish()
 
-        predictions = []
-
-        bar = create_progressbar(max_value=step_count,
-                                 prefix='Decoding predictions | ').start()
-        log_progress('Decoding predictions...')
-
-        # Second pass, decode logits and compute WER and edit distance metrics
-        for logits, seq_length in bar(zip(logitses, seq_lengths)):
-            decoded = ctc_beam_search_decoder_batch(logits, seq_length, Config.alphabet, FLAGS.beam_width,
-                                                    num_processes=num_processes, scorer=scorer)
-            predictions.extend(d[0][1] for d in decoded)
-
-        distances = [levenshtein(a, b) for a, b in zip(ground_truths, predictions)]
-
-        wer, cer, samples = calculate_report(ground_truths, predictions, distances, losses)
+        wer, cer, samples = calculate_report(ground_truths, predictions, losses)
         mean_loss = np.mean(losses)
 
         # Take only the first report_count items
@@ -144,7 +130,7 @@ def evaluate(test_csvs, create_model, try_loading):
         print('-' * 80)
         for sample in report_samples:
             print('WER: %f, CER: %f, loss: %f' %
-                  (sample.wer, sample.distance, sample.loss))
+                  (sample.wer, sample.cer, sample.loss))
             print(' - src: "%s"' % sample.src)
             print(' - res: "%s"' % sample.res)
             print('-' * 80)
diff --git a/evaluate_tflite.py b/evaluate_tflite.py
index 23836dba..12aaf03d 100644
--- a/evaluate_tflite.py
+++ b/evaluate_tflite.py
@@ -13,7 +13,6 @@ from six.moves import zip, range
 from multiprocessing import JoinableQueue, Pool, Process, Queue, cpu_count
 from deepspeech import Model
 
-from util.text import levenshtein
 from util.evaluate_tools import process_decode_result, calculate_report
 
 r'''
@@ -96,9 +95,7 @@ def main():
         ground_truths.append(msg['ground_truth'])
         predictions.append(msg['prediction'])
 
-    distances = [levenshtein(a, b) for a, b in zip(ground_truths, predictions)]
-
-    wer, cer, samples = calculate_report(ground_truths, predictions, distances, losses)
+    wer, cer, samples = calculate_report(ground_truths, predictions, losses)
     mean_loss = np.mean(losses)
 
     print('Test - WER: %f, CER: %f, loss: %f' %
diff --git a/util/evaluate_tools.py b/util/evaluate_tools.py
index d3939294..1ad91f46 100644
--- a/util/evaluate_tools.py
+++ b/util/evaluate_tools.py
@@ -6,7 +6,8 @@ from multiprocessing.dummy import Pool
 
 from attrdict import AttrDict
 
-from util.text import wer_cer_batch, levenshtein
+from util.text import levenshtein
+
 
 def pmap(fun, iterable):
     pool = Pool()
@@ -14,29 +15,53 @@ def pmap(fun, iterable):
     pool.close()
     return results
 
+
+def wer_cer_batch(samples):
+    r"""
+    The WER is defined as the edit/Levenshtein distance on word level divided by
+    the amount of words in the original text.
+    In case of the original having more words (N) than the result and both
+    being totally different (all N words resulting in 1 edit operation each),
+    the WER will always be 1 (N / N = 1).
+    """
+    wer = sum(s.word_distance for s in samples) / sum(s.word_length for s in samples)
+    cer = sum(s.char_distance for s in samples) / sum(s.char_length for s in samples)
+
+    wer = min(wer, 1.0)
+    cer = min(cer, 1.0)
+
+    return wer, cer
+
+
 def process_decode_result(item):
-    label, decoding, distance, loss = item
-    word_distance = levenshtein(label.split(), decoding.split())
-    word_length = float(len(label.split()))
+    ground_truth, prediction, loss = item
+    char_distance = levenshtein(ground_truth, prediction)
+    char_length = len(ground_truth)
+    word_distance = levenshtein(ground_truth.split(), prediction.split())
+    word_length = len(ground_truth.split())
     return AttrDict({
-        'src': label,
-        'res': decoding,
+        'src': ground_truth,
+        'res': prediction,
         'loss': loss,
-        'distance': distance,
+        'char_distance': char_distance,
+        'char_length': char_length,
+        'word_distance': word_distance,
+        'word_length': word_length,
+        'cer': char_distance / char_length,
         'wer': word_distance / word_length,
     })
 
 
-def calculate_report(labels, decodings, distances, losses):
+def calculate_report(labels, decodings, losses):
     r'''
     This routine will calculate a WER report.
     It'll compute the `mean` WER and create ``Sample`` objects of the ``report_count`` top lowest
     loss items from the provided WER results tuple (only items with WER!=0 and ordered by their WER).
     '''
-    samples = pmap(process_decode_result, zip(labels, decodings, distances, losses))
+    samples = pmap(process_decode_result, zip(labels, decodings, losses))
 
     # Getting the WER and CER from the accumulated edit distances and lengths
-    samples_wer, samples_cer = wer_cer_batch(labels, decodings)
+    samples_wer, samples_cer = wer_cer_batch(samples)
 
     # Order the remaining items by their loss (lowest loss on top)
     samples.sort(key=lambda s: s.loss)
diff --git a/util/text.py b/util/text.py
index 207c79ea..7ae6ef3e 100644
--- a/util/text.py
+++ b/util/text.py
@@ -55,34 +55,6 @@ def text_to_char_array(original, alphabet):
     return np.asarray([alphabet.label_from_string(c) for c in original])
 
 
-def wer_cer_batch(originals, results):
-    r"""
-    The WER is defined as the editing/Levenshtein distance on word level
-    divided by the amount of words in the original text.
-    In case of the original having more words (N) than the result and both
-    being totally different (all N words resulting in 1 edit operation each),
-    the WER will always be 1 (N / N = 1).
-    """
-    # The WER is calculated on word (and NOT on character) level.
-    # Therefore we split the strings into words first
-    assert len(originals) == len(results)
-
-    total_cer = 0.0
-    total_char_length = 0.0
-
-    total_wer = 0.0
-    total_word_length = 0.0
-
-    for original, result in zip(originals, results):
-        total_cer += levenshtein(original, result)
-        total_char_length += len(original)
-
-        total_wer += levenshtein(original.split(), result.split())
-        total_word_length += len(original.split())
-
-    return total_wer / total_word_length, total_cer / total_char_length
-
-
 # The following code is from: http://hetland.org/coding/python/levenshtein.py
 
 # This is a straightforward implementation of a well-known algorithm, and thus
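
With this change, corpus-level WER and CER are derived from per-sample tallies: the new wer_cer_batch() divides the summed edit distances by the summed reference lengths, so longer transcripts weigh proportionally more than they would under a mean of per-sample rates. The aggregation can be checked in isolation with the minimal sketch below; make_sample is a hypothetical helper standing in for process_decode_result, plain dicts stand in for AttrDict, and the transcripts are invented examples:

    # Standalone sketch of the corpus-level aggregation in wer_cer_batch().
    def levenshtein(a, b):
        # Iterative edit distance, same algorithm as util/text.py (hetland.org).
        n, m = len(a), len(b)
        if n > m:
            a, b = b, a
            n, m = m, n
        current = list(range(n + 1))
        for i in range(1, m + 1):
            previous, current = current, [i] + [0] * n
            for j in range(1, n + 1):
                add, delete = previous[j] + 1, current[j - 1] + 1
                change = previous[j - 1] + (a[j - 1] != b[i - 1])
                current[j] = min(add, delete, change)
        return current[n]

    def make_sample(ground_truth, prediction):
        # Hypothetical stand-in for process_decode_result (loss omitted).
        return {'char_distance': levenshtein(ground_truth, prediction),
                'char_length': len(ground_truth),
                'word_distance': levenshtein(ground_truth.split(), prediction.split()),
                'word_length': len(ground_truth.split())}

    samples = [make_sample('the quick brown fox', 'the quick brown fox'),
               make_sample('jumps over the lazy dog', 'jump over the lazy hog')]

    # Corpus WER/CER: summed distances over summed reference lengths, capped at 1.0.
    wer = min(sum(s['word_distance'] for s in samples) / sum(s['word_length'] for s in samples), 1.0)
    cer = min(sum(s['char_distance'] for s in samples) / sum(s['char_length'] for s in samples), 1.0)
    print('WER: %f, CER: %f' % (wer, cer))  # WER: 0.222222, CER: 0.047619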