From 7a14bcc4deebd5f4612587381a01cb97686d9155 Mon Sep 17 00:00:00 2001
From: Reuben Morais
Date: Mon, 28 Jan 2019 10:31:00 -0200
Subject: [PATCH] Clean up and split TensorFlow deps of text.py

---
 evaluate.py     |  24 ++++------
 util/ctc.py     |  57 +++++++++++++++++++++++
 util/feeding.py |   2 +-
 util/text.py    | 120 +++++++-----------------------------------------
 4 files changed, 85 insertions(+), 118 deletions(-)
 create mode 100644 util/ctc.py

diff --git a/evaluate.py b/evaluate.py
index ebbbd4f8..a93b8687 100755
--- a/evaluate.py
+++ b/evaluate.py
@@ -19,10 +19,11 @@ from multiprocessing import Pool, cpu_count
 from six.moves import zip, range
 from util.audio import audiofile_to_input_vector
 from util.config import Config, initialize_globals
+from util.ctc import ctc_label_dense_to_sparse
 from util.flags import create_flags, FLAGS
 from util.logging import log_error
 from util.preprocess import pmap, preprocess
-from util.text import Alphabet, ctc_label_dense_to_sparse, wer, levenshtein
+from util.text import Alphabet, wer_cer_batch, levenshtein
 
 
 def split_data(dataset, batch_size):
@@ -47,15 +48,14 @@ def pad_to_dense(jagged):
 
 def process_decode_result(item):
     label, decoding, distance, loss = item
-    sample_wer = wer(label, decoding)
+    word_distance = levenshtein(label.split(), decoding.split())
+    word_length = float(len(label.split()))
     return AttrDict({
         'src': label,
         'res': decoding,
         'loss': loss,
         'distance': distance,
-        'wer': sample_wer,
-        'levenshtein': levenshtein(label.split(), decoding.split()),
-        'label_length': float(len(label.split())),
+        'wer': word_distance / word_length,
     })
 
 
@@ -67,11 +67,8 @@ def calculate_report(labels, decodings, distances, losses):
     '''
     samples = pmap(process_decode_result, zip(labels, decodings, distances, losses))
 
-    total_levenshtein = sum(s.levenshtein for s in samples)
-    total_label_length = sum(s.label_length for s in samples)
-
-    # Getting the WER from the accumulated levenshteins and lengths
-    samples_wer = total_levenshtein / total_label_length
+    # Getting the WER and CER from the accumulated edit distances and lengths
+    samples_wer, samples_cer = wer_cer_batch(labels, decodings)
 
     # Order the remaining items by their loss (lowest loss on top)
     samples.sort(key=lambda s: s.loss)
@@ -79,7 +76,7 @@
     # Then order by WER (highest WER on top)
     samples.sort(key=lambda s: s.wer, reverse=True)
 
-    return samples_wer, samples
+    return samples_wer, samples_cer, samples
 
 
 def evaluate(test_data, inference_graph):
@@ -183,15 +180,14 @@ def evaluate(test_data, inference_graph):
         distances = [levenshtein(a, b) for a, b in zip(ground_truths, predictions)]
 
-        wer, samples = calculate_report(ground_truths, predictions, distances, losses)
-        mean_edit_distance = np.mean(distances)
+        wer, cer, samples = calculate_report(ground_truths, predictions, distances, losses)
         mean_loss = np.mean(losses)
 
         # Take only the first report_count items
         report_samples = itertools.islice(samples, FLAGS.report_count)
 
         print('Test - WER: %f, CER: %f, loss: %f' %
-              (wer, mean_edit_distance, mean_loss))
+              (wer, cer, mean_loss))
         print('-' * 80)
         for sample in report_samples:
             print('WER: %f, CER: %f, loss: %f' %
                   (sample.wer, sample.distance, sample.loss))
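
Note on the evaluate.py hunks above: the per-sample 'wer' field is now computed inline from the word-level edit distance and the reference length, while the aggregate WER/CER comes from the new wer_cer_batch helper (see util/text.py below). A minimal sketch of the per-sample arithmetic, using made-up strings that are not part of this patch:

    # Illustrative only: mirrors what process_decode_result now computes.
    from util.text import levenshtein

    label = 'the quick brown fox'
    decoding = 'the quack brown fox'

    word_distance = levenshtein(label.split(), decoding.split())  # 1 substituted word
    word_length = float(len(label.split()))                       # 4.0 reference words
    print(word_distance / word_length)                            # 0.25
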
diff --git a/util/ctc.py b/util/ctc.py
new file mode 100644
index 00000000..c6098ca2
--- /dev/null
+++ b/util/ctc.py
@@ -0,0 +1,57 @@
+from __future__ import absolute_import, division, print_function
+
+import tensorflow as tf
+
+from functools import reduce
+from six.moves import range
+
+
+# gather_nd is taken from https://github.com/tensorflow/tensorflow/issues/206#issuecomment-229678962
+#
+# Unfortunately we can't just use tf.gather_nd because it does not have gradients
+# implemented yet, so we need this workaround.
+#
+def gather_nd(params, indices, shape):
+    rank = len(shape)
+    flat_params = tf.reshape(params, [-1])
+    multipliers = [reduce(lambda x, y: x*y, shape[i+1:], 1) for i in range(0, rank)]
+    indices_unpacked = tf.unstack(tf.transpose(indices, [rank - 1] + list(range(0, rank - 1))))
+    flat_indices = sum([a*b for a,b in zip(multipliers, indices_unpacked)])
+    return tf.gather(flat_params, flat_indices)
+
+
+# ctc_label_dense_to_sparse is taken from https://github.com/tensorflow/tensorflow/issues/1742#issuecomment-205291527
+#
+# The CTC implementation in TensorFlow needs labels in a sparse representation,
+# but sparse data and queues don't mix well, so we store padded tensors in the
+# queue and convert to a sparse representation after dequeuing a batch.
+#
+def ctc_label_dense_to_sparse(labels, label_lengths, batch_size):
+    # The second dimension of labels must be equal to the longest label length in the batch
+    correct_shape_assert = tf.assert_equal(tf.shape(labels)[1], tf.reduce_max(label_lengths))
+    with tf.control_dependencies([correct_shape_assert]):
+        labels = tf.identity(labels)
+
+    label_shape = tf.shape(labels)
+    num_batches_tns = tf.stack([label_shape[0]])
+    max_num_labels_tns = tf.stack([label_shape[1]])
+    def range_less_than(previous_state, current_input):
+        return tf.expand_dims(tf.range(label_shape[1]), 0) < current_input
+
+    init = tf.cast(tf.fill(max_num_labels_tns, 0), tf.bool)
+    init = tf.expand_dims(init, 0)
+    dense_mask = tf.scan(range_less_than, label_lengths, initializer=init, parallel_iterations=1)
+    dense_mask = dense_mask[:, 0, :]
+
+    label_array = tf.reshape(tf.tile(tf.range(0, label_shape[1]), num_batches_tns),
+                             label_shape)
+    label_ind = tf.boolean_mask(label_array, dense_mask)
+
+    batch_array = tf.transpose(tf.reshape(tf.tile(tf.range(0, label_shape[0]), max_num_labels_tns), tf.reverse(label_shape, [0])))
+    batch_ind = tf.boolean_mask(batch_array, dense_mask)
+
+    indices = tf.transpose(tf.reshape(tf.concat([batch_ind, label_ind], 0), [2, -1]))
+    shape = [batch_size, tf.reduce_max(label_lengths)]
+    vals_sparse = gather_nd(labels, indices, shape)
+
+    return tf.SparseTensor(tf.to_int64(indices), vals_sparse, tf.to_int64(label_shape))
\ No newline at end of file
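
For context on how the new module is used: given a zero-padded dense label batch and the true per-example lengths, ctc_label_dense_to_sparse produces the tf.SparseTensor that TensorFlow's CTC ops expect. A minimal usage sketch, assuming TensorFlow 1.x graph mode; the toy label values (and the results shown in comments) are illustrative, not from this patch:

    import tensorflow as tf
    from util.ctc import ctc_label_dense_to_sparse

    # Two zero-padded label sequences with true lengths 3 and 2.
    labels = tf.constant([[1, 2, 3],
                          [4, 5, 0]], dtype=tf.int32)
    label_lengths = tf.constant([3, 2], dtype=tf.int32)

    sparse_labels = ctc_label_dense_to_sparse(labels, label_lengths, batch_size=2)

    with tf.Session() as session:
        result = session.run(sparse_labels)
        print(result.indices)  # [[0 0] [0 1] [0 2] [1 0] [1 1]] -- padding is dropped
        print(result.values)   # [1 2 3 4 5]
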
diff --git a/util/feeding.py b/util/feeding.py
index 26851bb7..fbaffa21 100644
--- a/util/feeding.py
+++ b/util/feeding.py
@@ -4,8 +4,8 @@ import tensorflow as tf
 from math import ceil
 from six.moves import range
 from threading import Thread
+from util.ctc import ctc_label_dense_to_sparse
 from util.gpu import get_available_gpus
-from util.text import ctc_label_dense_to_sparse
 
 
 class ModelFeeder(object):
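
The import moves here because feeding is where the dense queue output meets TensorFlow's CTC API: tf.nn.ctc_loss takes its labels as a SparseTensor, so the padded labels have to be converted right after dequeuing. A sketch of that call-site shape, assuming TensorFlow 1.x; the placeholder names and sizes below are invented for illustration and do not appear in this patch:

    import tensorflow as tf
    from util.ctc import ctc_label_dense_to_sparse

    batch_size = 16
    # Stand-ins for the tensors ModelFeeder dequeues.
    logits = tf.placeholder(tf.float32, [None, batch_size, 29])  # [max_time, batch, classes]
    seq_lengths = tf.placeholder(tf.int32, [batch_size])
    dense_labels = tf.placeholder(tf.int32, [batch_size, None])  # zero-padded
    label_lengths = tf.placeholder(tf.int32, [batch_size])

    sparse_labels = ctc_label_dense_to_sparse(dense_labels, label_lengths, batch_size)
    loss = tf.nn.ctc_loss(labels=sparse_labels, inputs=logits, sequence_length=seq_lengths)
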
diff --git a/util/text.py b/util/text.py
index 9e2b64cd..a3629429 100644
--- a/util/text.py
+++ b/util/text.py
@@ -2,12 +2,10 @@ from __future__ import absolute_import, division, print_function
 
 import codecs
 import numpy as np
-import tensorflow as tf
 import re
 import sys
 
 from six.moves import range
-from functools import reduce
 
 class Alphabet(object):
     def __init__(self, config_file):
@@ -56,6 +54,7 @@ class Alphabet(object):
     def config_file(self):
         return self._config_file
 
+
 def text_to_char_array(original, alphabet):
     r"""
     Given a Python string ``original``, remove unsupported characters, map characters
@@ -63,66 +62,30 @@ def text_to_char_array(original, alphabet):
     """
     return np.asarray([alphabet.label_from_string(c) for c in original])
 
-def sparse_tuple_from(sequences, dtype=np.int32):
-    r"""Creates a sparse representention of ``sequences``.
-    Args:
-        * sequences: a list of lists of type dtype where each element is a sequence
-    Returns a tuple with (indices, values, shape)
-    """
-    indices = []
-    values = []
-
-    for n, seq in enumerate(sequences):
-        indices.extend(zip([n]*len(seq), range(len(seq))))
-        values.extend(seq)
-
-    indices = np.asarray(indices, dtype=np.int64)
-    values = np.asarray(values, dtype=dtype)
-    shape = np.asarray([len(sequences), indices.max(0)[1]+1], dtype=np.int64)
-
-    return tf.SparseTensor(indices=indices, values=values, shape=shape)
-
-def sparse_tensor_value_to_texts(value, alphabet):
-    r"""
-    Given a :class:`tf.SparseTensor` ``value``, return an array of Python strings
-    representing its values.
-    """
-    return sparse_tuple_to_texts((value.indices, value.values, value.dense_shape), alphabet)
-
-def sparse_tuple_to_texts(tuple, alphabet):
-    indices = tuple[0]
-    values = tuple[1]
-    results = [''] * tuple[2][0]
-    for i in range(len(indices)):
-        index = indices[i][0]
-        results[index] += alphabet.string_from_label(values[i])
-    # List of strings
-    return results
-
-def wer(original, result):
+def wer_cer_batch(originals, results):
     r"""
     The WER is defined as the editing/Levenshtein distance on word level divided by
     the amount of words in the original text.
     In case of the original having more words (N) than the result and both
     being totally different (all N words resulting in 1 edit operation each),
     the WER will always be 1 (N / N = 1).
     """
-    # The WER ist calculated on word (and NOT on character) level.
-    # Therefore we split the strings into words first:
-    original = original.split()
-    result = result.split()
-    return levenshtein(original, result) / float(len(original))
+    # The WER is calculated on word (and NOT on character) level.
+    # Therefore we split the strings into words first
+    assert len(originals) == len(results)
 
-def wers(originals, results):
-    count = len(originals)
-    rates = []
-    mean = 0.0
-    assert count == len(results)
-    for i in range(count):
-        rate = wer(originals[i], results[i])
-        mean = mean + rate
-        rates.append(rate)
-    return rates, mean / float(count)
+    total_cer = 0.0
+
+    total_wer = 0.0
+    total_word_length = 0.0
+
+    for original, result in zip(originals, results):
+        total_cer += levenshtein(original, result)
+
+        total_wer += levenshtein(original.split(), result.split())
+        total_word_length += len(original.split())
+
+    return total_wer / total_word_length, total_cer / len(originals)
 
 
 # The following code is from: http://hetland.org/coding/python/levenshtein.py
@@ -155,55 +118,6 @@ def levenshtein(a,b):
     return current[n]
 
 
-# gather_nd is taken from https://github.com/tensorflow/tensorflow/issues/206#issuecomment-229678962
-#
-# Unfortunately we can't just use tf.gather_nd because it does not have gradients
-# implemented yet, so we need this workaround.
-#
-def gather_nd(params, indices, shape):
-    rank = len(shape)
-    flat_params = tf.reshape(params, [-1])
-    multipliers = [reduce(lambda x, y: x*y, shape[i+1:], 1) for i in range(0, rank)]
-    indices_unpacked = tf.unstack(tf.transpose(indices, [rank - 1] + list(range(0, rank - 1))))
-    flat_indices = sum([a*b for a,b in zip(multipliers, indices_unpacked)])
-    return tf.gather(flat_params, flat_indices)
-
-
-# ctc_label_dense_to_sparse is taken from https://github.com/tensorflow/tensorflow/issues/1742#issuecomment-205291527
-#
-# The CTC implementation in TensorFlow needs labels in a sparse representation,
-# but sparse data and queues don't mix well, so we store padded tensors in the
-# queue and convert to a sparse representation after dequeuing a batch.
-#
-def ctc_label_dense_to_sparse(labels, label_lengths, batch_size):
-    # The second dimension of labels must be equal to the longest label length in the batch
-    correct_shape_assert = tf.assert_equal(tf.shape(labels)[1], tf.reduce_max(label_lengths))
-    with tf.control_dependencies([correct_shape_assert]):
-        labels = tf.identity(labels)
-
-    label_shape = tf.shape(labels)
-    num_batches_tns = tf.stack([label_shape[0]])
-    max_num_labels_tns = tf.stack([label_shape[1]])
-    def range_less_than(previous_state, current_input):
-        return tf.expand_dims(tf.range(label_shape[1]), 0) < current_input
-
-    init = tf.cast(tf.fill(max_num_labels_tns, 0), tf.bool)
-    init = tf.expand_dims(init, 0)
-    dense_mask = tf.scan(range_less_than, label_lengths, initializer=init, parallel_iterations=1)
-    dense_mask = dense_mask[:, 0, :]
-
-    label_array = tf.reshape(tf.tile(tf.range(0, label_shape[1]), num_batches_tns),
-                             label_shape)
-    label_ind = tf.boolean_mask(label_array, dense_mask)
-
-    batch_array = tf.transpose(tf.reshape(tf.tile(tf.range(0, label_shape[0]), max_num_labels_tns), tf.reverse(label_shape, [0])))
-    batch_ind = tf.boolean_mask(batch_array, dense_mask)
-
-    indices = tf.transpose(tf.reshape(tf.concat([batch_ind, label_ind], 0), [2, -1]))
-    shape = [batch_size, tf.reduce_max(label_lengths)]
-    vals_sparse = gather_nd(labels, indices, shape)
-
-    return tf.SparseTensor(tf.to_int64(indices), vals_sparse, tf.to_int64(label_shape))
 # Validate and normalize transcriptions. Returns a cleaned version of the label
 # or None if it's invalid.
 def validate_label(label):
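
One caveat worth flagging on wer_cer_batch: its two return values are normalized differently. The first is total word-level edit distance over total reference words (a WER in the usual sense); the second is the mean character-level edit distance per sample pair, not distance over total reference characters, so it is not a length-normalized CER in the strict sense. A toy example (strings invented for illustration):

    from util.text import wer_cer_batch

    originals = ['hello world', 'good morning']
    results = ['hello word', 'god morning']

    wer, cer = wer_cer_batch(originals, results)
    # wer == (1 + 1) / 4 == 0.5  -- word edits over total reference words
    # cer == (1 + 1) / 2 == 1.0  -- mean character edits per sample
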