From 7a14bcc4deebd5f4612587381a01cb97686d9155 Mon Sep 17 00:00:00 2001
From: Reuben Morais
Date: Mon, 28 Jan 2019 10:31:00 -0200
Subject: [PATCH] Clean up and split TensorFlow deps of text.py

---
 evaluate.py     |  24 ++++------
 util/ctc.py     |  57 +++++++++++++++++++++++
 util/feeding.py |   2 +-
 util/text.py    | 120 +++++++-----------------------------------------
 4 files changed, 85 insertions(+), 118 deletions(-)
 create mode 100644 util/ctc.py

diff --git a/evaluate.py b/evaluate.py
index ebbbd4f8..a93b8687 100755
--- a/evaluate.py
+++ b/evaluate.py
@@ -19,10 +19,11 @@ from multiprocessing import Pool, cpu_count
 from six.moves import zip, range
 from util.audio import audiofile_to_input_vector
 from util.config import Config, initialize_globals
+from util.ctc import ctc_label_dense_to_sparse
 from util.flags import create_flags, FLAGS
 from util.logging import log_error
 from util.preprocess import pmap, preprocess
-from util.text import Alphabet, ctc_label_dense_to_sparse, wer, levenshtein
+from util.text import Alphabet, wer_cer_batch, levenshtein
 
 
 def split_data(dataset, batch_size):
@@ -47,15 +48,14 @@ def pad_to_dense(jagged):
 
 def process_decode_result(item):
     label, decoding, distance, loss = item
-    sample_wer = wer(label, decoding)
+    word_distance = levenshtein(label.split(), decoding.split())
+    word_length = float(len(label.split()))
     return AttrDict({
         'src': label,
         'res': decoding,
         'loss': loss,
         'distance': distance,
-        'wer': sample_wer,
-        'levenshtein': levenshtein(label.split(), decoding.split()),
-        'label_length': float(len(label.split())),
+        'wer': word_distance / word_length,
     })
 
 
@@ -67,11 +67,8 @@ def calculate_report(labels, decodings, distances, losses):
     '''
     samples = pmap(process_decode_result, zip(labels, decodings, distances, losses))
 
-    total_levenshtein = sum(s.levenshtein for s in samples)
-    total_label_length = sum(s.label_length for s in samples)
-
-    # Getting the WER from the accumulated levenshteins and lengths
-    samples_wer = total_levenshtein / total_label_length
+    # Getting the WER and CER from the accumulated edit distances and lengths
+    samples_wer, samples_cer = wer_cer_batch(labels, decodings)
 
     # Order the remaining items by their loss (lowest loss on top)
     samples.sort(key=lambda s: s.loss)
@@ -79,7 +76,7 @@
     # Then order by WER (highest WER on top)
     samples.sort(key=lambda s: s.wer, reverse=True)
 
-    return samples_wer, samples
+    return samples_wer, samples_cer, samples
 
 
 def evaluate(test_data, inference_graph):
@@ -183,15 +180,14 @@ def evaluate(test_data, inference_graph):
         distances = [levenshtein(a, b) for a, b in zip(ground_truths, predictions)]
 
-        wer, samples = calculate_report(ground_truths, predictions, distances, losses)
-        mean_edit_distance = np.mean(distances)
+        wer, cer, samples = calculate_report(ground_truths, predictions, distances, losses)
         mean_loss = np.mean(losses)
 
         # Take only the first report_count items
         report_samples = itertools.islice(samples, FLAGS.report_count)
 
         print('Test - WER: %f, CER: %f, loss: %f' %
-              (wer, mean_edit_distance, mean_loss))
+              (wer, cer, mean_loss))
         print('-' * 80)
         for sample in report_samples:
             print('WER: %f, CER: %f, loss: %f' %
                   (sample.wer, sample.distance, sample.loss))
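
Note on the evaluate.py hunks above: the per-sample 'wer' field is now computed inline from the word-level edit distance and the reference length, while the aggregate WER/CER comes from the new wer_cer_batch helper (see util/text.py below). A minimal sketch of the per-sample arithmetic, using made-up strings that are not part of this patch:

    # Illustrative only: mirrors what process_decode_result now computes.
    from util.text import levenshtein

    label = 'the quick brown fox'
    decoding = 'the quack brown fox'

    word_distance = levenshtein(label.split(), decoding.split())  # 1 substituted word
    word_length = float(len(label.split()))                       # 4.0 reference words
    print(word_distance / word_length)                            # 0.25
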
diff --git a/util/ctc.py b/util/ctc.py
new file mode 100644
index 00000000..c6098ca2
--- /dev/null
+++ b/util/ctc.py
@@ -0,0 +1,57 @@
+from __future__ import absolute_import, division, print_function
+
+import tensorflow as tf
+
+from functools import reduce
+from six.moves import range
+
+
+# gather_nd is taken from https://github.com/tensorflow/tensorflow/issues/206#issuecomment-229678962
+#
+# Unfortunately we can't just use tf.gather_nd because it does not have gradients
+# implemented yet, so we need this workaround.
+#
+def gather_nd(params, indices, shape):
+    rank = len(shape)
+    flat_params = tf.reshape(params, [-1])
+    multipliers = [reduce(lambda x, y: x*y, shape[i+1:], 1) for i in range(0, rank)]
+    indices_unpacked = tf.unstack(tf.transpose(indices, [rank - 1] + list(range(0, rank - 1))))
+    flat_indices = sum([a*b for a,b in zip(multipliers, indices_unpacked)])
+    return tf.gather(flat_params, flat_indices)
+
+
+# ctc_label_dense_to_sparse is taken from https://github.com/tensorflow/tensorflow/issues/1742#issuecomment-205291527
+#
+# The CTC implementation in TensorFlow needs labels in a sparse representation,
+# but sparse data and queues don't mix well, so we store padded tensors in the
+# queue and convert to a sparse representation after dequeuing a batch.
+#
+def ctc_label_dense_to_sparse(labels, label_lengths, batch_size):
+    # The second dimension of labels must be equal to the longest label length in the batch
+    correct_shape_assert = tf.assert_equal(tf.shape(labels)[1], tf.reduce_max(label_lengths))
+    with tf.control_dependencies([correct_shape_assert]):
+        labels = tf.identity(labels)
+
+    label_shape = tf.shape(labels)
+    num_batches_tns = tf.stack([label_shape[0]])
+    max_num_labels_tns = tf.stack([label_shape[1]])
+    def range_less_than(previous_state, current_input):
+        return tf.expand_dims(tf.range(label_shape[1]), 0) < current_input
+
+    init = tf.cast(tf.fill(max_num_labels_tns, 0), tf.bool)
+    init = tf.expand_dims(init, 0)
+    dense_mask = tf.scan(range_less_than, label_lengths, initializer=init, parallel_iterations=1)
+    dense_mask = dense_mask[:, 0, :]
+
+    label_array = tf.reshape(tf.tile(tf.range(0, label_shape[1]), num_batches_tns),
+                             label_shape)
+    label_ind = tf.boolean_mask(label_array, dense_mask)
+
+    batch_array = tf.transpose(tf.reshape(tf.tile(tf.range(0, label_shape[0]), max_num_labels_tns), tf.reverse(label_shape, [0])))
+    batch_ind = tf.boolean_mask(batch_array, dense_mask)
+
+    indices = tf.transpose(tf.reshape(tf.concat([batch_ind, label_ind], 0), [2, -1]))
+    shape = [batch_size, tf.reduce_max(label_lengths)]
+    vals_sparse = gather_nd(labels, indices, shape)
+
+    return tf.SparseTensor(tf.to_int64(indices), vals_sparse, tf.to_int64(label_shape))
\ No newline at end of file
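
For context on how the new module is used: given a zero-padded dense label batch and the true per-example lengths, ctc_label_dense_to_sparse produces the tf.SparseTensor that TensorFlow's CTC ops expect. A minimal usage sketch, assuming TensorFlow 1.x graph mode; the toy label values (and the results shown in comments) are illustrative, not from this patch:

    import tensorflow as tf
    from util.ctc import ctc_label_dense_to_sparse

    # Two zero-padded label sequences with true lengths 3 and 2.
    labels = tf.constant([[1, 2, 3],
                          [4, 5, 0]], dtype=tf.int32)
    label_lengths = tf.constant([3, 2], dtype=tf.int32)

    sparse_labels = ctc_label_dense_to_sparse(labels, label_lengths, batch_size=2)

    with tf.Session() as session:
        result = session.run(sparse_labels)
        print(result.indices)  # [[0 0] [0 1] [0 2] [1 0] [1 1]] -- padding is dropped
        print(result.values)   # [1 2 3 4 5]
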
diff --git a/util/feeding.py b/util/feeding.py
index 26851bb7..fbaffa21 100644
--- a/util/feeding.py
+++ b/util/feeding.py
@@ -4,8 +4,8 @@ import tensorflow as tf
 from math import ceil
 from six.moves import range
 from threading import Thread
+from util.ctc import ctc_label_dense_to_sparse
 from util.gpu import get_available_gpus
-from util.text import ctc_label_dense_to_sparse
 
 
 class ModelFeeder(object):
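
The import moves here because feeding is where the dense queue output meets TensorFlow's CTC API: tf.nn.ctc_loss takes its labels as a SparseTensor, so the padded labels have to be converted right after dequeuing. A sketch of that call-site shape, assuming TensorFlow 1.x; the placeholder names and sizes below are invented for illustration and do not appear in this patch:

    import tensorflow as tf
    from util.ctc import ctc_label_dense_to_sparse

    batch_size = 16
    # Stand-ins for the tensors ModelFeeder dequeues.
    logits = tf.placeholder(tf.float32, [None, batch_size, 29])  # [max_time, batch, classes]
    seq_lengths = tf.placeholder(tf.int32, [batch_size])
    dense_labels = tf.placeholder(tf.int32, [batch_size, None])  # zero-padded
    label_lengths = tf.placeholder(tf.int32, [batch_size])

    sparse_labels = ctc_label_dense_to_sparse(dense_labels, label_lengths, batch_size)
    loss = tf.nn.ctc_loss(labels=sparse_labels, inputs=logits, sequence_length=seq_lengths)
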
diff --git a/util/text.py b/util/text.py
index 9e2b64cd..a3629429 100644
--- a/util/text.py
+++ b/util/text.py
@@ -2,12 +2,10 @@ from __future__ import absolute_import, division, print_function
 
 import codecs
 import numpy as np
-import tensorflow as tf
 import re
 import sys
 
 from six.moves import range
-from functools import reduce
 
 class Alphabet(object):
     def __init__(self, config_file):
@@ -56,6 +54,7 @@ class Alphabet(object):
     def config_file(self):
         return self._config_file
 
+
 def text_to_char_array(original, alphabet):
     r"""
     Given a Python string ``original``, remove unsupported characters, map characters
@@ -63,66 +62,30 @@ def text_to_char_array(original, alphabet):
     """
     return np.asarray([alphabet.label_from_string(c) for c in original])
 
-def sparse_tuple_from(sequences, dtype=np.int32):
-    r"""Creates a sparse representention of ``sequences``.
-    Args:
-        * sequences: a list of lists of type dtype where each element is a sequence
-    Returns a tuple with (indices, values, shape)
-    """
-    indices = []
-    values = []
-
-    for n, seq in enumerate(sequences):
-        indices.extend(zip([n]*len(seq), range(len(seq))))
-        values.extend(seq)
-
-    indices = np.asarray(indices, dtype=np.int64)
-    values = np.asarray(values, dtype=dtype)
-    shape = np.asarray([len(sequences), indices.max(0)[1]+1], dtype=np.int64)
-
-    return tf.SparseTensor(indices=indices, values=values, shape=shape)
-
-def sparse_tensor_value_to_texts(value, alphabet):
-    r"""
-    Given a :class:`tf.SparseTensor` ``value``, return an array of Python strings
-    representing its values.
-    """
-    return sparse_tuple_to_texts((value.indices, value.values, value.dense_shape), alphabet)
-
-def sparse_tuple_to_texts(tuple, alphabet):
-    indices = tuple[0]
-    values = tuple[1]
-    results = [''] * tuple[2][0]
-    for i in range(len(indices)):
-        index = indices[i][0]
-        results[index] += alphabet.string_from_label(values[i])
-    # List of strings
-    return results
-
-def wer(original, result):
+def wer_cer_batch(originals, results):
     r"""
     The WER is defined as the editing/Levenshtein distance on word level divided by
     the amount of words in the original text.
     In case of the original having more words (N) than the result and both
     being totally different (all N words resulting in 1 edit operation each),
     the WER will always be 1 (N / N = 1).
     """
-    # The WER ist calculated on word (and NOT on character) level.
-    # Therefore we split the strings into words first:
-    original = original.split()
-    result = result.split()
-    return levenshtein(original, result) / float(len(original))
+    # The WER is calculated on word (and NOT on character) level.
+    # Therefore we split the strings into words first
+    assert len(originals) == len(results)
 
-def wers(originals, results):
-    count = len(originals)
-    rates = []
-    mean = 0.0
-    assert count == len(results)
-    for i in range(count):
-        rate = wer(originals[i], results[i])
-        mean = mean + rate
-        rates.append(rate)
-    return rates, mean / float(count)
+    total_cer = 0.0
+
+    total_wer = 0.0
+    total_word_length = 0.0
+
+    for original, result in zip(originals, results):
+        total_cer += levenshtein(original, result)
+
+        total_wer += levenshtein(original.split(), result.split())
+        total_word_length += len(original.split())
+
+    return total_wer / total_word_length, total_cer / len(originals)
 
 
 # The following code is from: http://hetland.org/coding/python/levenshtein.py
@@ -155,55 +118,6 @@ def levenshtein(a,b):
     return current[n]
 
 
-# gather_nd is taken from https://github.com/tensorflow/tensorflow/issues/206#issuecomment-229678962
-#
-# Unfortunately we can't just use tf.gather_nd because it does not have gradients
-# implemented yet, so we need this workaround.
-#
-def gather_nd(params, indices, shape):
-    rank = len(shape)
-    flat_params = tf.reshape(params, [-1])
-    multipliers = [reduce(lambda x, y: x*y, shape[i+1:], 1) for i in range(0, rank)]
-    indices_unpacked = tf.unstack(tf.transpose(indices, [rank - 1] + list(range(0, rank - 1))))
-    flat_indices = sum([a*b for a,b in zip(multipliers, indices_unpacked)])
-    return tf.gather(flat_params, flat_indices)
-
-
-# ctc_label_dense_to_sparse is taken from https://github.com/tensorflow/tensorflow/issues/1742#issuecomment-205291527
-#
-# The CTC implementation in TensorFlow needs labels in a sparse representation,
-# but sparse data and queues don't mix well, so we store padded tensors in the
-# queue and convert to a sparse representation after dequeuing a batch.
-#
-def ctc_label_dense_to_sparse(labels, label_lengths, batch_size):
-    # The second dimension of labels must be equal to the longest label length in the batch
-    correct_shape_assert = tf.assert_equal(tf.shape(labels)[1], tf.reduce_max(label_lengths))
-    with tf.control_dependencies([correct_shape_assert]):
-        labels = tf.identity(labels)
-
-    label_shape = tf.shape(labels)
-    num_batches_tns = tf.stack([label_shape[0]])
-    max_num_labels_tns = tf.stack([label_shape[1]])
-    def range_less_than(previous_state, current_input):
-        return tf.expand_dims(tf.range(label_shape[1]), 0) < current_input
-
-    init = tf.cast(tf.fill(max_num_labels_tns, 0), tf.bool)
-    init = tf.expand_dims(init, 0)
-    dense_mask = tf.scan(range_less_than, label_lengths, initializer=init, parallel_iterations=1)
-    dense_mask = dense_mask[:, 0, :]
-
-    label_array = tf.reshape(tf.tile(tf.range(0, label_shape[1]), num_batches_tns),
-                             label_shape)
-    label_ind = tf.boolean_mask(label_array, dense_mask)
-
-    batch_array = tf.transpose(tf.reshape(tf.tile(tf.range(0, label_shape[0]), max_num_labels_tns), tf.reverse(label_shape, [0])))
-    batch_ind = tf.boolean_mask(batch_array, dense_mask)
-
-    indices = tf.transpose(tf.reshape(tf.concat([batch_ind, label_ind], 0), [2, -1]))
-    shape = [batch_size, tf.reduce_max(label_lengths)]
-    vals_sparse = gather_nd(labels, indices, shape)
-
-    return tf.SparseTensor(tf.to_int64(indices), vals_sparse, tf.to_int64(label_shape))
 # Validate and normalize transcriptions. Returns a cleaned version of the label
 # or None if it's invalid.
 def validate_label(label):
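
One caveat worth flagging on wer_cer_batch: its two return values are normalized differently. The first is total word-level edit distance over total reference words (a WER in the usual sense); the second is the mean character-level edit distance per sample pair, not distance over total reference characters, so it is not a length-normalized CER in the strict sense. A toy example (strings invented for illustration):

    from util.text import wer_cer_batch

    originals = ['hello world', 'good morning']
    results = ['hello word', 'god morning']

    wer, cer = wer_cer_batch(originals, results)
    # wer == (1 + 1) / 4 == 0.5  -- word edits over total reference words
    # cer == (1 + 1) / 2 == 1.0  -- mean character edits per sample
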