Merge pull request #2111 from mozilla/test-epoch-oom

Revert to a pipelined approach for test epochs to avoid CPU OOM with large alphabets
Reuben Morais 2019-05-14 18:57:30 +00:00 committed by GitHub
commit df5bb31046
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 49 additions and 69 deletions
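Context for the change: the previous two-pass implementation buffered the transposed logits of every test batch in host memory before decoding, and logit storage grows linearly with alphabet size. A back-of-envelope sketch of why that blows up (a hypothetical sizing helper, all numbers illustrative and not taken from this PR):

def logits_buffer_bytes(num_batches, time_steps, batch_size, alphabet_size,
                        bytes_per_float=4):
    # Each batch of logits holds time_steps * batch_size * (alphabet_size + 1)
    # float32 values; the +1 is the CTC blank label.
    return num_batches * time_steps * batch_size * (alphabet_size + 1) * bytes_per_float

# ~2600 test batches of ~500 frames at batch size 16:
print(logits_buffer_bytes(2600, 500, 16, 28) / 2**30)    # English alphabet: ~2.2 GiB
print(logits_buffer_bytes(2600, 500, 16, 8000) / 2**30)  # ~8k-char alphabet: ~620 GiB

Decoding each batch as soon as its logits are produced keeps at most one batch of logits alive at a time, so peak memory no longer scales with the size of the test set.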

View File

@@ -19,7 +19,6 @@ from util.evaluate_tools import calculate_report
 from util.feeding import create_dataset
 from util.flags import create_flags, FLAGS
 from util.logging import log_error, log_progress, create_progressbar
-from util.text import levenshtein


 def sparse_tensor_value_to_texts(value, alphabet):
@@ -88,14 +87,13 @@ def evaluate(test_csvs, create_model, try_loading):
         exit(1)

     def run_test(init_op, dataset):
-        logitses = []
         losses = []
-        seq_lengths = []
+        predictions = []
         ground_truths = []

-        bar = create_progressbar(prefix='Computing acoustic model predictions | ',
+        bar = create_progressbar(prefix='Test epoch | ',
                                  widgets=['Steps: ', progressbar.Counter(), ' | ', progressbar.Timer()]).start()
-        log_progress('Computing acoustic model predictions...')
+        log_progress('Test epoch...')

         step_count = 0
@@ -105,35 +103,23 @@ def evaluate(test_csvs, create_model, try_loading):
         # First pass, compute losses and transposed logits for decoding
         while True:
             try:
-                logits, loss_, lengths, transcripts = session.run([transposed, loss, batch_x_len, batch_y])
+                batch_logits, batch_loss, batch_lengths, batch_transcripts = \
+                    session.run([transposed, loss, batch_x_len, batch_y])
             except tf.errors.OutOfRangeError:
                 break

+            decoded = ctc_beam_search_decoder_batch(batch_logits, batch_lengths, Config.alphabet, FLAGS.beam_width,
+                                                    num_processes=num_processes, scorer=scorer)
+            predictions.extend(d[0][1] for d in decoded)
+            ground_truths.extend(sparse_tensor_value_to_texts(batch_transcripts, Config.alphabet))
+            losses.extend(batch_loss)
+
             step_count += 1
             bar.update(step_count)

-            logitses.append(logits)
-            losses.extend(loss_)
-            seq_lengths.append(lengths)
-            ground_truths.extend(sparse_tensor_value_to_texts(transcripts, Config.alphabet))
-
         bar.finish()

-        predictions = []
-        bar = create_progressbar(max_value=step_count,
-                                 prefix='Decoding predictions | ').start()
-        log_progress('Decoding predictions...')
-
-        # Second pass, decode logits and compute WER and edit distance metrics
-        for logits, seq_length in bar(zip(logitses, seq_lengths)):
-            decoded = ctc_beam_search_decoder_batch(logits, seq_length, Config.alphabet, FLAGS.beam_width,
-                                                    num_processes=num_processes, scorer=scorer)
-            predictions.extend(d[0][1] for d in decoded)
-
-        distances = [levenshtein(a, b) for a, b in zip(ground_truths, predictions)]
-
-        wer, cer, samples = calculate_report(ground_truths, predictions, distances, losses)
+        wer, cer, samples = calculate_report(ground_truths, predictions, losses)
         mean_loss = np.mean(losses)

         # Take only the first report_count items
@@ -144,7 +130,7 @@ def evaluate(test_csvs, create_model, try_loading):
         print('-' * 80)
         for sample in report_samples:
             print('WER: %f, CER: %f, loss: %f' %
-                  (sample.wer, sample.distance, sample.loss))
+                  (sample.wer, sample.cer, sample.loss))
             print(' - src: "%s"' % sample.src)
             print(' - res: "%s"' % sample.res)
             print('-' * 80)
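Paraphrasing the change above as a standalone sketch of the pipelined pattern (run_batch and decode_batch are hypothetical stand-ins for the session.run and ctc_beam_search_decoder_batch calls, not names from the patch):

def pipelined_test_epoch(run_batch, decode_batch):
    # Single pass: decode each batch immediately after its forward pass,
    # so at most one batch of logits is resident at any time.
    losses, predictions, ground_truths = [], [], []
    while True:
        batch = run_batch()  # one forward pass; None once the dataset is exhausted
        if batch is None:
            break
        logits, batch_loss, lengths, transcripts = batch
        predictions.extend(decode_batch(logits, lengths))
        ground_truths.extend(transcripts)
        losses.extend(batch_loss)
        # logits go out of scope here and can be freed before the next batch
    return losses, predictions, ground_truths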

View File

@@ -13,7 +13,6 @@ from six.moves import zip, range
 from multiprocessing import JoinableQueue, Pool, Process, Queue, cpu_count

 from deepspeech import Model
-from util.text import levenshtein
 from util.evaluate_tools import process_decode_result, calculate_report

 r'''
@@ -96,9 +95,7 @@ def main():
         ground_truths.append(msg['ground_truth'])
         predictions.append(msg['prediction'])

-    distances = [levenshtein(a, b) for a, b in zip(ground_truths, predictions)]
-
-    wer, cer, samples = calculate_report(ground_truths, predictions, distances, losses)
+    wer, cer, samples = calculate_report(ground_truths, predictions, losses)
     mean_loss = np.mean(losses)

     print('Test - WER: %f, CER: %f, loss: %f' %

View File

@@ -6,7 +6,8 @@ from multiprocessing.dummy import Pool
 from attrdict import AttrDict

-from util.text import wer_cer_batch, levenshtein
+from util.text import levenshtein


 def pmap(fun, iterable):
     pool = Pool()
@@ -14,29 +15,53 @@ def pmap(fun, iterable):
     pool.close()
     return results


+def wer_cer_batch(samples):
+    r"""
+    The WER is defined as the edit/Levenshtein distance on word level divided by
+    the amount of words in the original text.
+    In case of the original having more words (N) than the result and both
+    being totally different (all N words resulting in 1 edit operation each),
+    the WER will always be 1 (N / N = 1).
+    """
+    wer = sum(s.word_distance for s in samples) / sum(s.word_length for s in samples)
+    cer = sum(s.char_distance for s in samples) / sum(s.char_length for s in samples)
+
+    wer = min(wer, 1.0)
+    cer = min(cer, 1.0)
+
+    return wer, cer
+
+
 def process_decode_result(item):
-    label, decoding, distance, loss = item
-    word_distance = levenshtein(label.split(), decoding.split())
-    word_length = float(len(label.split()))
+    ground_truth, prediction, loss = item
+    char_distance = levenshtein(ground_truth, prediction)
+    char_length = len(ground_truth)
+    word_distance = levenshtein(ground_truth.split(), prediction.split())
+    word_length = len(ground_truth.split())
     return AttrDict({
-        'src': label,
-        'res': decoding,
+        'src': ground_truth,
+        'res': prediction,
         'loss': loss,
-        'distance': distance,
+        'char_distance': char_distance,
+        'char_length': char_length,
+        'word_distance': word_distance,
+        'word_length': word_length,
+        'cer': char_distance / char_length,
         'wer': word_distance / word_length,
     })


-def calculate_report(labels, decodings, distances, losses):
+def calculate_report(labels, decodings, losses):
     r'''
     This routine will calculate a WER report.
     It'll compute the `mean` WER and create ``Sample`` objects of the ``report_count`` top lowest
     loss items from the provided WER results tuple (only items with WER!=0 and ordered by their WER).
     '''
-    samples = pmap(process_decode_result, zip(labels, decodings, distances, losses))
+    samples = pmap(process_decode_result, zip(labels, decodings, losses))

     # Getting the WER and CER from the accumulated edit distances and lengths
-    samples_wer, samples_cer = wer_cer_batch(labels, decodings)
+    samples_wer, samples_cer = wer_cer_batch(samples)

     # Order the remaining items by their loss (lowest loss on top)
     samples.sort(key=lambda s: s.loss)
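A quick sanity check of the new accumulation-based metrics (made-up strings; wer_cer_batch repeated from the hunk above so the snippet runs standalone):

from attrdict import AttrDict

def wer_cer_batch(samples):  # as added above, minus the docstring
    wer = sum(s.word_distance for s in samples) / sum(s.word_length for s in samples)
    cer = sum(s.char_distance for s in samples) / sum(s.char_length for s in samples)
    return min(wer, 1.0), min(cer, 1.0)

# "the cat" -> "the bat": 1 char edit over 7 chars, 1 word edit over 2 words.
# "a dog"   -> "a dog":   0 edits over 5 chars and 2 words.
samples = [
    AttrDict({'word_distance': 1, 'word_length': 2, 'char_distance': 1, 'char_length': 7}),
    AttrDict({'word_distance': 0, 'word_length': 2, 'char_distance': 0, 'char_length': 5}),
]
print(wer_cer_batch(samples))  # (0.25, 0.0833...): (1+0)/(2+2) and (1+0)/(7+5)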

View File

@@ -55,34 +55,6 @@ def text_to_char_array(original, alphabet):
     return np.asarray([alphabet.label_from_string(c) for c in original])


-def wer_cer_batch(originals, results):
-    r"""
-    The WER is defined as the editing/Levenshtein distance on word level
-    divided by the amount of words in the original text.
-    In case of the original having more words (N) than the result and both
-    being totally different (all N words resulting in 1 edit operation each),
-    the WER will always be 1 (N / N = 1).
-    """
-    # The WER is calculated on word (and NOT on character) level.
-    # Therefore we split the strings into words first
-    assert len(originals) == len(results)
-
-    total_cer = 0.0
-    total_char_length = 0.0
-
-    total_wer = 0.0
-    total_word_length = 0.0
-
-    for original, result in zip(originals, results):
-        total_cer += levenshtein(original, result)
-        total_char_length += len(original)
-
-        total_wer += levenshtein(original.split(), result.split())
-        total_word_length += len(original.split())
-
-    return total_wer / total_word_length, total_cer / total_char_length
-
-
 # The following code is from: http://hetland.org/coding/python/levenshtein.py
 # This is a straightforward implementation of a well-known algorithm, and thus
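The diff is truncated here, before the implementation that comment refers to. For reference, a minimal version of that well-known dynamic-programming algorithm (a sketch, not necessarily byte-identical to the file's code):

def levenshtein(a, b):
    # Edit distance between two sequences: strings for CER, or lists of
    # words (via str.split()) for WER.
    n, m = len(a), len(b)
    if n > m:
        a, b, n, m = b, a, m, n  # keep the shorter sequence on the row axis
    current = list(range(n + 1))
    for i in range(1, m + 1):
        previous, current = current, [i] + [0] * n
        for j in range(1, n + 1):
            add, delete = previous[j] + 1, current[j - 1] + 1
            change = previous[j - 1] + (a[j - 1] != b[i - 1])
            current[j] = min(add, delete, change)
    return current[n]

# levenshtein("the cat", "the bat") == 1
# levenshtein("the cat".split(), "the bat".split()) == 1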