Use tf.contrib.layers.dense_to_sparse instead of util/ctc.py

Reuben Morais 2019-02-04 09:19:48 -02:00
parent 7a14bcc4de
commit f3613da82a
3 changed files with 17 additions and 64 deletions

evaluate.py

@@ -19,7 +19,6 @@ from multiprocessing import Pool, cpu_count
 from six.moves import zip, range
 from util.audio import audiofile_to_input_vector
 from util.config import Config, initialize_globals
-from util.ctc import ctc_label_dense_to_sparse
 from util.flags import create_flags, FLAGS
 from util.logging import log_error
 from util.preprocess import pmap, preprocess
@@ -111,7 +110,14 @@ def evaluate(test_data, inference_graph):
     labels_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size, None], name="labels")
     label_lengths_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size], name="label_lengths")
 
-    sparse_labels = tf.cast(ctc_label_dense_to_sparse(labels_ph, label_lengths_ph, FLAGS.test_batch_size), tf.int32)
+    # We add 1 to all elements of the transcript to avoid any zero values
+    # since we use that as an end-of-sequence token for converting the batch
+    # into a SparseTensor. So here we convert the placeholder back into a
+    # SparseTensor and subtract ones to get the real labels.
+    sparse_labels = tf.contrib.layers.dense_to_sparse(labels_ph)
+    neg_ones = tf.SparseTensor(sparse_labels.indices, -1 * tf.ones_like(sparse_labels.values), sparse_labels.dense_shape)
+    sparse_labels = tf.sparse_add(sparse_labels, neg_ones)
+
     loss = tf.nn.ctc_loss(labels=sparse_labels,
                           inputs=layers['raw_logits'],
                           sequence_length=inputs['input_lengths'])
@@ -143,7 +149,7 @@ def evaluate(test_data, inference_graph):
     features = pad_to_dense(batch['features'].values)
     features_len = batch['features_len'].values
-    labels = pad_to_dense(batch['transcript'].values)
+    labels = pad_to_dense(batch['transcript'].values + 1)
     label_lengths = batch['transcript_len'].values
     logits, loss_ = session.run([transposed, loss], feed_dict={
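
The add-one / dense_to_sparse / subtract-one round trip introduced above is easy to check in isolation. The following is a minimal TF 1.x sketch, not part of this diff, using made-up batch values: transcripts arrive shifted by +1 so that 0 only ever marks padding, dense_to_sparse drops the zeros (its default eos_token), and tf.sparse_add with a SparseTensor of -1s undoes the shift.

import numpy as np
import tensorflow as tf

# Two hypothetical transcripts, already shifted by +1 and padded with 0.
padded_plus_one = np.array([[3, 1, 2, 0],
                            [5, 4, 0, 0]], dtype=np.int32)

labels_ph = tf.placeholder(tf.int32, [None, None])
sparse = tf.contrib.layers.dense_to_sparse(labels_ph)  # drops the 0 padding
neg_ones = tf.SparseTensor(sparse.indices,
                           -1 * tf.ones_like(sparse.values),
                           sparse.dense_shape)
real_labels = tf.sparse_add(sparse, neg_ones)          # undo the +1 shift

with tf.Session() as session:
    result = session.run(real_labels, feed_dict={labels_ph: padded_plus_one})
    print(result.values)  # [2 0 1 4 3] -- the original label values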

util/ctc.py

@@ -1,57 +0,0 @@
-from __future__ import absolute_import, division, print_function
-
-import tensorflow as tf
-
-from functools import reduce
-from six.moves import range
-
-# gather_nd is taken from https://github.com/tensorflow/tensorflow/issues/206#issuecomment-229678962
-#
-# Unfortunately we can't just use tf.gather_nd because it does not have gradients
-# implemented yet, so we need this workaround.
-#
-def gather_nd(params, indices, shape):
-    rank = len(shape)
-    flat_params = tf.reshape(params, [-1])
-    multipliers = [reduce(lambda x, y: x*y, shape[i+1:], 1) for i in range(0, rank)]
-    indices_unpacked = tf.unstack(tf.transpose(indices, [rank - 1] + list(range(0, rank - 1))))
-    flat_indices = sum([a*b for a,b in zip(multipliers, indices_unpacked)])
-    return tf.gather(flat_params, flat_indices)
-
-# ctc_label_dense_to_sparse is taken from https://github.com/tensorflow/tensorflow/issues/1742#issuecomment-205291527
-#
-# The CTC implementation in TensorFlow needs labels in a sparse representation,
-# but sparse data and queues don't mix well, so we store padded tensors in the
-# queue and convert to a sparse representation after dequeuing a batch.
-#
-def ctc_label_dense_to_sparse(labels, label_lengths, batch_size):
-    # The second dimension of labels must be equal to the longest label length in the batch
-    correct_shape_assert = tf.assert_equal(tf.shape(labels)[1], tf.reduce_max(label_lengths))
-    with tf.control_dependencies([correct_shape_assert]):
-        labels = tf.identity(labels)
-
-    label_shape = tf.shape(labels)
-    num_batches_tns = tf.stack([label_shape[0]])
-    max_num_labels_tns = tf.stack([label_shape[1]])
-
-    def range_less_than(previous_state, current_input):
-        return tf.expand_dims(tf.range(label_shape[1]), 0) < current_input
-
-    init = tf.cast(tf.fill(max_num_labels_tns, 0), tf.bool)
-    init = tf.expand_dims(init, 0)
-    dense_mask = tf.scan(range_less_than, label_lengths, initializer=init, parallel_iterations=1)
-    dense_mask = dense_mask[:, 0, :]
-
-    label_array = tf.reshape(tf.tile(tf.range(0, label_shape[1]), num_batches_tns),
-                             label_shape)
-    label_ind = tf.boolean_mask(label_array, dense_mask)
-
-    batch_array = tf.transpose(tf.reshape(tf.tile(tf.range(0, label_shape[0]), max_num_labels_tns), tf.reverse(label_shape, [0])))
-    batch_ind = tf.boolean_mask(batch_array, dense_mask)
-
-    indices = tf.transpose(tf.reshape(tf.concat([batch_ind, label_ind], 0), [2, -1]))
-
-    shape = [batch_size, tf.reduce_max(label_lengths)]
-    vals_sparse = gather_nd(labels, indices, shape)
-
-    return tf.SparseTensor(tf.to_int64(indices), vals_sparse, tf.to_int64(label_shape))
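
For reference, the gather_nd workaround deleted here relies only on row-major index flattening: an index (i, j) into a tensor of shape (R, C) maps to position i*C + j in the flattened tensor, so a plain gather over the flat array emulates gather_nd. A short numpy sketch (illustrative values, not from the repository):

import numpy as np

params = np.array([[10, 11, 12],
                   [20, 21, 22]])
indices = np.array([[0, 2], [1, 0]])          # gather elements (0, 2) and (1, 0)
multipliers = np.array([params.shape[1], 1])  # [3, 1] for shape (2, 3)
flat_indices = indices.dot(multipliers)       # [2, 3]
print(params.reshape(-1)[flat_indices])       # [12 20]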

util/feeding.py

@@ -4,7 +4,6 @@ import tensorflow as tf
 from math import ceil
 from six.moves import range
 from threading import Thread
-from util.ctc import ctc_label_dense_to_sparse
 from util.gpu import get_available_gpus
@@ -143,11 +142,14 @@ class _DataSetLoader(object):
                 (features.strides[0], features.strides[0], features.strides[1]),
                 writeable=False)
 
+            # We add 1 to all elements of the transcript here to avoid any zero
+            # values since we use that as an end-of-sequence token for converting
+            # the batch into a SparseTensor.
             try:
                 session.run(self._enqueue_op, feed_dict={
                     self._model_feeder.ph_x: features,
                     self._model_feeder.ph_x_length: num_strides,
-                    self._model_feeder.ph_y: transcript,
+                    self._model_feeder.ph_y: transcript + 1,
                     self._model_feeder.ph_y_length: transcript_len
                 })
             except tf.errors.CancelledError:
@@ -173,8 +175,10 @@ class _TowerFeeder(object):
        Draw the next batch from the combined switchable queue.
        '''
        source, source_lengths, target, target_lengths = self._queue.dequeue_many(self._model_feeder.ph_batch_size)
-       sparse_labels = ctc_label_dense_to_sparse(target, target_lengths, self._model_feeder.ph_batch_size)
-       return source, source_lengths, sparse_labels
+       # Back to sparse, then subtract one to get the real labels
+       sparse_labels = tf.contrib.layers.dense_to_sparse(target)
+       neg_ones = tf.SparseTensor(sparse_labels.indices, -1 * tf.ones_like(sparse_labels.values), sparse_labels.dense_shape)
+       return source, source_lengths, tf.sparse_add(sparse_labels, neg_ones)
 
    def start_queue_threads(self, session, coord):
        '''
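
The reason the queue holds dense, shifted transcripts in the first place is that TF 1.x queues cannot carry SparseTensors, while a PaddingFIFOQueue pads variable-length dense tensors automatically on dequeue_many. A minimal sketch of that behavior, with illustrative names and values that are not part of this diff:

import tensorflow as tf

# A queue of variable-length int32 transcripts; shapes=[[None]] lets
# PaddingFIFOQueue pad each dequeued batch with 0s up to the longest element.
queue = tf.PaddingFIFOQueue(capacity=8, dtypes=[tf.int32], shapes=[[None]])
transcript_ph = tf.placeholder(tf.int32, [None])
enqueue_op = queue.enqueue([transcript_ph + 1])  # +1 shift before enqueueing
batch = queue.dequeue_many(2)                    # padded with 0s

with tf.Session() as session:
    session.run(enqueue_op, feed_dict={transcript_ph: [2, 0, 1]})
    session.run(enqueue_op, feed_dict={transcript_ph: [4, 3]})
    print(session.run(batch))  # [[3 1 2] [5 4 0]] -- 0 only marks padding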