Clean up and split TensorFlow deps of text.py

Reuben Morais 2019-01-28 10:31:00 -02:00
parent 3378008f5d
commit 7a14bcc4de
4 changed files with 85 additions and 118 deletions

evaluate.py

@@ -19,10 +19,11 @@ from multiprocessing import Pool, cpu_count
 from six.moves import zip, range
 from util.audio import audiofile_to_input_vector
 from util.config import Config, initialize_globals
+from util.ctc import ctc_label_dense_to_sparse
 from util.flags import create_flags, FLAGS
 from util.logging import log_error
 from util.preprocess import pmap, preprocess
-from util.text import Alphabet, ctc_label_dense_to_sparse, wer, levenshtein
+from util.text import Alphabet, wer_cer_batch, levenshtein

 def split_data(dataset, batch_size):
@@ -47,15 +48,14 @@ def pad_to_dense(jagged):
 def process_decode_result(item):
     label, decoding, distance, loss = item
-    sample_wer = wer(label, decoding)
+    word_distance = levenshtein(label.split(), decoding.split())
+    word_length = float(len(label.split()))
     return AttrDict({
         'src': label,
         'res': decoding,
         'loss': loss,
         'distance': distance,
-        'wer': sample_wer,
-        'levenshtein': levenshtein(label.split(), decoding.split()),
-        'label_length': float(len(label.split())),
+        'wer': word_distance / word_length,
     })
@@ -67,11 +67,8 @@ def calculate_report(labels, decodings, distances, losses):
     '''
     samples = pmap(process_decode_result, zip(labels, decodings, distances, losses))

-    total_levenshtein = sum(s.levenshtein for s in samples)
-    total_label_length = sum(s.label_length for s in samples)
-
-    # Getting the WER from the accumulated levenshteins and lengths
-    samples_wer = total_levenshtein / total_label_length
+    # Getting the WER and CER from the accumulated edit distances and lengths
+    samples_wer, samples_cer = wer_cer_batch(labels, decodings)

     # Order the remaining items by their loss (lowest loss on top)
     samples.sort(key=lambda s: s.loss)
@@ -79,7 +76,7 @@ def calculate_report(labels, decodings, distances, losses):
     # Then order by WER (highest WER on top)
     samples.sort(key=lambda s: s.wer, reverse=True)

-    return samples_wer, samples
+    return samples_wer, samples_cer, samples

 def evaluate(test_data, inference_graph):
@@ -183,15 +180,14 @@ def evaluate(test_data, inference_graph):
     distances = [levenshtein(a, b) for a, b in zip(ground_truths, predictions)]

-    wer, samples = calculate_report(ground_truths, predictions, distances, losses)
-    mean_edit_distance = np.mean(distances)
+    wer, cer, samples = calculate_report(ground_truths, predictions, distances, losses)
     mean_loss = np.mean(losses)

     # Take only the first report_count items
     report_samples = itertools.islice(samples, FLAGS.report_count)

     print('Test - WER: %f, CER: %f, loss: %f' %
-          (wer, mean_edit_distance, mean_loss))
+          (wer, cer, mean_loss))
     print('-' * 80)
     for sample in report_samples:
         print('WER: %f, CER: %f, loss: %f' %

util/ctc.py (new file)

@@ -0,0 +1,57 @@
from __future__ import absolute_import, division, print_function
import tensorflow as tf
from functools import reduce
from six.moves import range
# gather_nd is taken from https://github.com/tensorflow/tensorflow/issues/206#issuecomment-229678962
#
# Unfortunately we can't just use tf.gather_nd because it does not have gradients
# implemented yet, so we need this workaround.
#
def gather_nd(params, indices, shape):
    rank = len(shape)
    flat_params = tf.reshape(params, [-1])
    multipliers = [reduce(lambda x, y: x*y, shape[i+1:], 1) for i in range(0, rank)]
    indices_unpacked = tf.unstack(tf.transpose(indices, [rank - 1] + list(range(0, rank - 1))))
    flat_indices = sum([a*b for a,b in zip(multipliers, indices_unpacked)])
    return tf.gather(flat_params, flat_indices)
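A rough illustration of the flattening trick above, using NumPy stand-ins that are not part of the commit: for shape = [2, 3] the multipliers are [3, 1], so an index pair (i, j) is mapped to the row-major flat offset 3*i + j before the final tf.gather.

import numpy as np
params = np.arange(6).reshape(2, 3)      # row-major [2, 3] tensor: [[0, 1, 2], [3, 4, 5]]
indices = np.array([[0, 2], [1, 0]])     # gather params[0, 2] and params[1, 0]
multipliers = [3, 1]                     # products of shape[i+1:] for shape = [2, 3]
flat = params.reshape(-1)[indices @ multipliers]
assert (flat == np.array([2, 3])).all()  # params[0, 2] == 2, params[1, 0] == 3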
# ctc_label_dense_to_sparse is taken from https://github.com/tensorflow/tensorflow/issues/1742#issuecomment-205291527
#
# The CTC implementation in TensorFlow needs labels in a sparse representation,
# but sparse data and queues don't mix well, so we store padded tensors in the
# queue and convert to a sparse representation after dequeuing a batch.
#
def ctc_label_dense_to_sparse(labels, label_lengths, batch_size):
    # The second dimension of labels must be equal to the longest label length in the batch
    correct_shape_assert = tf.assert_equal(tf.shape(labels)[1], tf.reduce_max(label_lengths))
    with tf.control_dependencies([correct_shape_assert]):
        labels = tf.identity(labels)

    label_shape = tf.shape(labels)
    num_batches_tns = tf.stack([label_shape[0]])
    max_num_labels_tns = tf.stack([label_shape[1]])

    def range_less_than(previous_state, current_input):
        return tf.expand_dims(tf.range(label_shape[1]), 0) < current_input

    init = tf.cast(tf.fill(max_num_labels_tns, 0), tf.bool)
    init = tf.expand_dims(init, 0)
    dense_mask = tf.scan(range_less_than, label_lengths, initializer=init, parallel_iterations=1)
    dense_mask = dense_mask[:, 0, :]

    label_array = tf.reshape(tf.tile(tf.range(0, label_shape[1]), num_batches_tns),
                             label_shape)
    label_ind = tf.boolean_mask(label_array, dense_mask)

    batch_array = tf.transpose(tf.reshape(tf.tile(tf.range(0, label_shape[0]), max_num_labels_tns), tf.reverse(label_shape, [0])))
    batch_ind = tf.boolean_mask(batch_array, dense_mask)

    indices = tf.transpose(tf.reshape(tf.concat([batch_ind, label_ind], 0), [2, -1]))
    shape = [batch_size, tf.reduce_max(label_lengths)]
    vals_sparse = gather_nd(labels, indices, shape)

    return tf.SparseTensor(tf.to_int64(indices), vals_sparse, tf.to_int64(label_shape))
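For context, a minimal sketch of how this helper might be wired into TensorFlow's CTC loss after dequeuing a padded batch (TF 1.x graph mode; the placeholder shapes and the alphabet size of 29 are illustrative assumptions, not taken from this commit):

import tensorflow as tf
from util.ctc import ctc_label_dense_to_sparse

batch_size = 16
# Zero-padded transcripts and their true lengths, as stored in the queue
dense_labels = tf.placeholder(tf.int32, [batch_size, None])
label_lengths = tf.placeholder(tf.int32, [batch_size])
# Time-major acoustic model output (max_time, batch_size, num_classes) and per-example lengths
logits = tf.placeholder(tf.float32, [None, batch_size, 29])
seq_lengths = tf.placeholder(tf.int32, [batch_size])

# Convert the dense batch to the sparse representation expected by tf.nn.ctc_loss
sparse_labels = ctc_label_dense_to_sparse(dense_labels, label_lengths, batch_size)
loss = tf.nn.ctc_loss(labels=sparse_labels, inputs=logits, sequence_length=seq_lengths)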

util/feeding.py

@@ -4,8 +4,8 @@ import tensorflow as tf
 from math import ceil
 from six.moves import range
 from threading import Thread

+from util.ctc import ctc_label_dense_to_sparse
 from util.gpu import get_available_gpus
-from util.text import ctc_label_dense_to_sparse

 class ModelFeeder(object):

util/text.py

@@ -2,12 +2,10 @@ from __future__ import absolute_import, division, print_function
 import codecs
 import numpy as np
-import tensorflow as tf
 import re
 import sys

 from six.moves import range
-from functools import reduce

 class Alphabet(object):
     def __init__(self, config_file):
@@ -56,6 +54,7 @@ class Alphabet(object):
     def config_file(self):
         return self._config_file

 def text_to_char_array(original, alphabet):
     r"""
     Given a Python string ``original``, remove unsupported characters, map characters
@@ -63,44 +62,8 @@ def text_to_char_array(original, alphabet):
     """
     return np.asarray([alphabet.label_from_string(c) for c in original])

-def sparse_tuple_from(sequences, dtype=np.int32):
-    r"""Creates a sparse representention of ``sequences``.
-    Args:
-        * sequences: a list of lists of type dtype where each element is a sequence
-    Returns a tuple with (indices, values, shape)
-    """
-    indices = []
-    values = []
-
-    for n, seq in enumerate(sequences):
-        indices.extend(zip([n]*len(seq), range(len(seq))))
-        values.extend(seq)
-
-    indices = np.asarray(indices, dtype=np.int64)
-    values = np.asarray(values, dtype=dtype)
-    shape = np.asarray([len(sequences), indices.max(0)[1]+1], dtype=np.int64)
-
-    return tf.SparseTensor(indices=indices, values=values, shape=shape)
-
-def sparse_tensor_value_to_texts(value, alphabet):
-    r"""
-    Given a :class:`tf.SparseTensor` ``value``, return an array of Python strings
-    representing its values.
-    """
-    return sparse_tuple_to_texts((value.indices, value.values, value.dense_shape), alphabet)
-
-def sparse_tuple_to_texts(tuple, alphabet):
-    indices = tuple[0]
-    values = tuple[1]
-    results = [''] * tuple[2][0]
-    for i in range(len(indices)):
-        index = indices[i][0]
-        results[index] += alphabet.string_from_label(values[i])
-    # List of strings
-    return results
-
-def wer(original, result):
+def wer_cer_batch(originals, results):
     r"""
     The WER is defined as the editing/Levenshtein distance on word level
     divided by the amount of words in the original text.
@@ -108,22 +71,22 @@ def wer(original, result):
     being totally different (all N words resulting in 1 edit operation each),
     the WER will always be 1 (N / N = 1).
     """
-    # The WER ist calculated on word (and NOT on character) level.
-    # Therefore we split the strings into words first:
-    original = original.split()
-    result = result.split()
-    return levenshtein(original, result) / float(len(original))
-
-def wers(originals, results):
-    count = len(originals)
-    rates = []
-    mean = 0.0
-    assert count == len(results)
-    for i in range(count):
-        rate = wer(originals[i], results[i])
-        mean = mean + rate
-        rates.append(rate)
-    return rates, mean / float(count)
+    # The WER is calculated on word (and NOT on character) level.
+    # Therefore we split the strings into words first
+    assert len(originals) == len(results)
+
+    total_cer = 0.0
+    total_wer = 0.0
+    total_word_length = 0.0
+
+    for original, result in zip(originals, results):
+        total_cer += levenshtein(original, result)
+        total_wer += levenshtein(original.split(), result.split())
+        total_word_length += len(original.split())
+
+    return total_wer / total_word_length, total_cer / len(originals)
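A quick sanity check of the new helper on made-up transcripts (not part of the commit): one substituted word out of two gives a WER of 0.5, and the single character-level edit, averaged over the one sample, gives 1.0 for the second return value.

from util.text import wer_cer_batch

wer, cer = wer_cer_batch(['hello world'], ['hello word'])
# WER: 1 word-level edit / 2 reference words = 0.5
# second value: total character-level edit distance / number of samples = 1 / 1 = 1.0
assert (wer, cer) == (0.5, 1.0)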
 # The following code is from: http://hetland.org/coding/python/levenshtein.py
@@ -155,55 +118,6 @@ def levenshtein(a,b):
     return current[n]
-# gather_nd is taken from https://github.com/tensorflow/tensorflow/issues/206#issuecomment-229678962
-#
-# Unfortunately we can't just use tf.gather_nd because it does not have gradients
-# implemented yet, so we need this workaround.
-#
-def gather_nd(params, indices, shape):
-    rank = len(shape)
-    flat_params = tf.reshape(params, [-1])
-    multipliers = [reduce(lambda x, y: x*y, shape[i+1:], 1) for i in range(0, rank)]
-    indices_unpacked = tf.unstack(tf.transpose(indices, [rank - 1] + list(range(0, rank - 1))))
-    flat_indices = sum([a*b for a,b in zip(multipliers, indices_unpacked)])
-    return tf.gather(flat_params, flat_indices)
-
-# ctc_label_dense_to_sparse is taken from https://github.com/tensorflow/tensorflow/issues/1742#issuecomment-205291527
-#
-# The CTC implementation in TensorFlow needs labels in a sparse representation,
-# but sparse data and queues don't mix well, so we store padded tensors in the
-# queue and convert to a sparse representation after dequeuing a batch.
-#
-def ctc_label_dense_to_sparse(labels, label_lengths, batch_size):
-    # The second dimension of labels must be equal to the longest label length in the batch
-    correct_shape_assert = tf.assert_equal(tf.shape(labels)[1], tf.reduce_max(label_lengths))
-    with tf.control_dependencies([correct_shape_assert]):
-        labels = tf.identity(labels)
-
-    label_shape = tf.shape(labels)
-    num_batches_tns = tf.stack([label_shape[0]])
-    max_num_labels_tns = tf.stack([label_shape[1]])
-
-    def range_less_than(previous_state, current_input):
-        return tf.expand_dims(tf.range(label_shape[1]), 0) < current_input
-
-    init = tf.cast(tf.fill(max_num_labels_tns, 0), tf.bool)
-    init = tf.expand_dims(init, 0)
-    dense_mask = tf.scan(range_less_than, label_lengths, initializer=init, parallel_iterations=1)
-    dense_mask = dense_mask[:, 0, :]
-
-    label_array = tf.reshape(tf.tile(tf.range(0, label_shape[1]), num_batches_tns),
-                             label_shape)
-    label_ind = tf.boolean_mask(label_array, dense_mask)
-
-    batch_array = tf.transpose(tf.reshape(tf.tile(tf.range(0, label_shape[0]), max_num_labels_tns), tf.reverse(label_shape, [0])))
-    batch_ind = tf.boolean_mask(batch_array, dense_mask)
-
-    indices = tf.transpose(tf.reshape(tf.concat([batch_ind, label_ind], 0), [2, -1]))
-    shape = [batch_size, tf.reduce_max(label_lengths)]
-    vals_sparse = gather_nd(labels, indices, shape)
-
-    return tf.SparseTensor(tf.to_int64(indices), vals_sparse, tf.to_int64(label_shape))
 # Validate and normalize transcriptions. Returns a cleaned version of the label
 # or None if it's invalid.
 def validate_label(label):