From 699e4ebcd7b988d76428057db2775a461cb8b962 Mon Sep 17 00:00:00 2001
From: Reuben Morais
Date: Sat, 11 May 2019 08:15:28 -0300
Subject: [PATCH] Revert to a pipelined approach for test epochs to avoid CPU
 OOM with large alphabets

---
 evaluate.py            | 40 ++++++++++++-------------------
 evaluate_tflite.py     |  5 +----
 util/evaluate_tools.py | 45 ++++++++++++++++++++++++++++++++----------
 util/text.py           | 28 --------------------------
 4 files changed, 49 insertions(+), 69 deletions(-)

diff --git a/evaluate.py b/evaluate.py
index 2dc767f8..6e7033fe 100755
--- a/evaluate.py
+++ b/evaluate.py
@@ -19,7 +19,6 @@ from util.evaluate_tools import calculate_report
 from util.feeding import create_dataset
 from util.flags import create_flags, FLAGS
 from util.logging import log_error, log_progress, create_progressbar
-from util.text import levenshtein
 
 
 def sparse_tensor_value_to_texts(value, alphabet):
@@ -88,14 +87,13 @@ def evaluate(test_csvs, create_model, try_loading):
         exit(1)
 
     def run_test(init_op, dataset):
-        logitses = []
         losses = []
-        seq_lengths = []
+        predictions = []
         ground_truths = []
 
-        bar = create_progressbar(prefix='Computing acoustic model predictions | ',
+        bar = create_progressbar(prefix='Test epoch | ',
                                  widgets=['Steps: ', progressbar.Counter(),
                                           ' | ', progressbar.Timer()]).start()
-        log_progress('Computing acoustic model predictions...')
+        log_progress('Test epoch...')
 
         step_count = 0
 
@@ -105,35 +103,23 @@ def evaluate(test_csvs, create_model, try_loading):
         # First pass, compute losses and transposed logits for decoding
         while True:
             try:
-                logits, loss_, lengths, transcripts = session.run([transposed, loss, batch_x_len, batch_y])
+                batch_logits, batch_loss, batch_lengths, batch_transcripts = \
+                    session.run([transposed, loss, batch_x_len, batch_y])
             except tf.errors.OutOfRangeError:
                 break
 
+            decoded = ctc_beam_search_decoder_batch(batch_logits, batch_lengths, Config.alphabet, FLAGS.beam_width,
+                                                    num_processes=num_processes, scorer=scorer)
+            predictions.extend(d[0][1] for d in decoded)
+            ground_truths.extend(sparse_tensor_value_to_texts(batch_transcripts, Config.alphabet))
+            losses.extend(batch_loss)
+
             step_count += 1
             bar.update(step_count)
 
-            logitses.append(logits)
-            losses.extend(loss_)
-            seq_lengths.append(lengths)
-            ground_truths.extend(sparse_tensor_value_to_texts(transcripts, Config.alphabet))
-
         bar.finish()
 
-        predictions = []
-
-        bar = create_progressbar(max_value=step_count,
-                                 prefix='Decoding predictions | ').start()
-        log_progress('Decoding predictions...')
-
-        # Second pass, decode logits and compute WER and edit distance metrics
-        for logits, seq_length in bar(zip(logitses, seq_lengths)):
-            decoded = ctc_beam_search_decoder_batch(logits, seq_length, Config.alphabet, FLAGS.beam_width,
-                                                    num_processes=num_processes, scorer=scorer)
-            predictions.extend(d[0][1] for d in decoded)
-
-        distances = [levenshtein(a, b) for a, b in zip(ground_truths, predictions)]
-
-        wer, cer, samples = calculate_report(ground_truths, predictions, distances, losses)
+        wer, cer, samples = calculate_report(ground_truths, predictions, losses)
         mean_loss = np.mean(losses)
 
         # Take only the first report_count items
@@ -144,7 +130,7 @@ def evaluate(test_csvs, create_model, try_loading):
         print('-' * 80)
         for sample in report_samples:
             print('WER: %f, CER: %f, loss: %f' %
-                  (sample.wer, sample.distance, sample.loss))
+                  (sample.wer, sample.cer, sample.loss))
             print(' - src: "%s"' % sample.src)
             print(' - res: "%s"' % sample.res)
             print('-' * 80)
diff --git a/evaluate_tflite.py b/evaluate_tflite.py
index 23836dba..12aaf03d 100644
--- a/evaluate_tflite.py
+++ b/evaluate_tflite.py
@@ -13,7 +13,6 @@ from six.moves import zip, range
 from multiprocessing import JoinableQueue, Pool, Process, Queue, cpu_count
 from deepspeech import Model
 
-from util.text import levenshtein
 from util.evaluate_tools import process_decode_result, calculate_report
 
 r'''
@@ -96,9 +95,7 @@ def main():
         ground_truths.append(msg['ground_truth'])
         predictions.append(msg['prediction'])
 
-    distances = [levenshtein(a, b) for a, b in zip(ground_truths, predictions)]
-
-    wer, cer, samples = calculate_report(ground_truths, predictions, distances, losses)
+    wer, cer, samples = calculate_report(ground_truths, predictions, losses)
     mean_loss = np.mean(losses)
 
     print('Test - WER: %f, CER: %f, loss: %f' %
diff --git a/util/evaluate_tools.py b/util/evaluate_tools.py
index d3939294..1ad91f46 100644
--- a/util/evaluate_tools.py
+++ b/util/evaluate_tools.py
@@ -6,7 +6,8 @@ from multiprocessing.dummy import Pool
 
 from attrdict import AttrDict
 
-from util.text import wer_cer_batch, levenshtein
+from util.text import levenshtein
+
 
 def pmap(fun, iterable):
     pool = Pool()
@@ -14,29 +15,53 @@ def pmap(fun, iterable):
     pool.close()
     return results
 
+
+def wer_cer_batch(samples):
+    r"""
+    The WER is defined as the edit/Levenshtein distance on word level divided by
+    the amount of words in the original text.
+    In case of the original having more words (N) than the result and both
+    being totally different (all N words resulting in 1 edit operation each),
+    the WER will always be 1 (N / N = 1).
+    """
+    wer = sum(s.word_distance for s in samples) / sum(s.word_length for s in samples)
+    cer = sum(s.char_distance for s in samples) / sum(s.char_length for s in samples)
+
+    wer = min(wer, 1.0)
+    cer = min(cer, 1.0)
+
+    return wer, cer
+
+
 def process_decode_result(item):
-    label, decoding, distance, loss = item
-    word_distance = levenshtein(label.split(), decoding.split())
-    word_length = float(len(label.split()))
+    ground_truth, prediction, loss = item
+    char_distance = levenshtein(ground_truth, prediction)
+    char_length = len(ground_truth)
+    word_distance = levenshtein(ground_truth.split(), prediction.split())
+    word_length = len(ground_truth.split())
     return AttrDict({
-        'src': label,
-        'res': decoding,
+        'src': ground_truth,
+        'res': prediction,
         'loss': loss,
-        'distance': distance,
+        'char_distance': char_distance,
+        'char_length': char_length,
+        'word_distance': word_distance,
+        'word_length': word_length,
+        'cer': char_distance / char_length,
         'wer': word_distance / word_length,
     })
 
 
-def calculate_report(labels, decodings, distances, losses):
+def calculate_report(labels, decodings, losses):
     r'''
     This routine will calculate a WER report.
     It'll compute the `mean` WER and create ``Sample`` objects of the ``report_count`` top lowest
     loss items from the provided WER results tuple (only items with WER!=0 and ordered by their WER).
     '''
-    samples = pmap(process_decode_result, zip(labels, decodings, distances, losses))
+    samples = pmap(process_decode_result, zip(labels, decodings, losses))
 
     # Getting the WER and CER from the accumulated edit distances and lengths
-    samples_wer, samples_cer = wer_cer_batch(labels, decodings)
+    samples_wer, samples_cer = wer_cer_batch(samples)
 
     # Order the remaining items by their loss (lowest loss on top)
     samples.sort(key=lambda s: s.loss)
diff --git a/util/text.py b/util/text.py
index 207c79ea..7ae6ef3e 100644
--- a/util/text.py
+++ b/util/text.py
@@ -55,34 +55,6 @@ def text_to_char_array(original, alphabet):
     return np.asarray([alphabet.label_from_string(c) for c in original])
 
 
-def wer_cer_batch(originals, results):
-    r"""
-    The WER is defined as the editing/Levenshtein distance on word level
-    divided by the amount of words in the original text.
-    In case of the original having more words (N) than the result and both
-    being totally different (all N words resulting in 1 edit operation each),
-    the WER will always be 1 (N / N = 1).
-    """
-    # The WER is calculated on word (and NOT on character) level.
-    # Therefore we split the strings into words first
-    assert len(originals) == len(results)
-
-    total_cer = 0.0
-    total_char_length = 0.0
-
-    total_wer = 0.0
-    total_word_length = 0.0
-
-    for original, result in zip(originals, results):
-        total_cer += levenshtein(original, result)
-        total_char_length += len(original)
-
-        total_wer += levenshtein(original.split(), result.split())
-        total_word_length += len(original.split())
-
-    return total_wer / total_word_length, total_cer / total_char_length
-
-
 # The following code is from: http://hetland.org/coding/python/levenshtein.py
 
 # This is a straightforward implementation of a well-known algorithm, and thus
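
With this change, corpus-level WER and CER are derived from per-sample tallies: the new wer_cer_batch() divides the summed edit distances by the summed reference lengths, so longer transcripts weigh proportionally more than they would under a mean of per-sample rates. The aggregation can be checked in isolation with the minimal sketch below; make_sample is a hypothetical helper standing in for process_decode_result, plain dicts stand in for AttrDict, and the transcripts are invented examples:

    # Standalone sketch of the corpus-level aggregation in wer_cer_batch().
    def levenshtein(a, b):
        # Iterative edit distance, same algorithm as util/text.py (hetland.org).
        n, m = len(a), len(b)
        if n > m:
            a, b = b, a
            n, m = m, n
        current = list(range(n + 1))
        for i in range(1, m + 1):
            previous, current = current, [i] + [0] * n
            for j in range(1, n + 1):
                add, delete = previous[j] + 1, current[j - 1] + 1
                change = previous[j - 1] + (a[j - 1] != b[i - 1])
                current[j] = min(add, delete, change)
        return current[n]

    def make_sample(ground_truth, prediction):
        # Hypothetical stand-in for process_decode_result (loss omitted).
        return {'char_distance': levenshtein(ground_truth, prediction),
                'char_length': len(ground_truth),
                'word_distance': levenshtein(ground_truth.split(), prediction.split()),
                'word_length': len(ground_truth.split())}

    samples = [make_sample('the quick brown fox', 'the quick brown fox'),
               make_sample('jumps over the lazy dog', 'jump over the lazy hog')]

    # Corpus WER/CER: summed distances over summed reference lengths, capped at 1.0.
    wer = min(sum(s['word_distance'] for s in samples) / sum(s['word_length'] for s in samples), 1.0)
    cer = min(sum(s['char_distance'] for s in samples) / sum(s['char_length'] for s in samples), 1.0)
    print('WER: %f, CER: %f' % (wer, cer))  # WER: 0.222222, CER: 0.047619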