Add options to lookup layers to handle vocabularies from files.

PiperOrigin-RevId: 338487147
Change-Id: I1274f5e300bfdc5b1e9d338c2946dc2ad74b76d7
A. Unique TensorFlower 2020-10-22 09:23:52 -07:00, committed by TensorFlower Gardener
parent 4fd222fe53
commit 270db009cf
7 changed files with 112 additions and 1 deletion

RELEASE.md

@@ -293,6 +293,8 @@ This release contains contributions from many people at Google, as well as:
* Improvements to Keras preprocessing layers:
  * TextVectorization can now accept a vocabulary list or file as an
    init arg.
  * TextVectorization, StringLookup, and IntegerLookup can now accept a
    vocabulary file via the `set_vocabulary` method.
  * Normalization can now accept mean and variance values as init args.
* In `Attention` and `AdditiveAttention` layers, the `call()` method now
  accepts a `return_attention_scores` argument. When set to

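For illustration, a minimal sketch of the release-note items above, assuming the TF 2.4-era `tf.keras.layers.experimental.preprocessing` namespace; the vocabulary file path and contents are hypothetical, not code from this commit:

import tensorflow as tf

# Hypothetical vocabulary file: one token per line.
with open("/tmp/vocab.txt", "w") as f:
  f.write("earth\nwind\nand\nfire\n")

# New in this release: a vocabulary file path can be passed directly as
# an init arg.
layer = tf.keras.layers.experimental.preprocessing.TextVectorization(
    standardize=None, split=None, vocabulary="/tmp/vocab.txt")

# Indices 0 and 1 are reserved (padding and OOV), so "earth" maps to 2
# and the unknown token "michigan" maps to 1.
print(layer(tf.constant([["earth"], ["michigan"]])))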
tensorflow/python/keras/layers/preprocessing/integer_lookup.py

@@ -217,3 +217,9 @@ class IntegerLookup(index_lookup.IndexLookup):
    base_config["oov_value"] = base_config["oov_token"]
    del base_config["oov_token"]
    return base_config

  def set_vocabulary(self, vocab):
    if isinstance(vocab, str):
      vocab = table_utils.get_vocabulary_from_file(vocab)
      vocab = [int(v) for v in vocab]
    super().set_vocabulary(vocab)

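A usage sketch of the `IntegerLookup.set_vocabulary` override above, under the same TF 2.4-era namespace assumption; the file path and contents are hypothetical:

import tensorflow as tf

# Hypothetical vocab file: one integer token per line.
with open("/tmp/int_vocab.txt", "w") as f:
  f.write("42\n1138\n725\n1729\n")

layer = tf.keras.layers.experimental.preprocessing.IntegerLookup()
# The override reads the file and casts each line to int before delegating
# to the base class setter.
layer.set_vocabulary("/tmp/int_vocab.txt")

# 203 is not in the vocab, so it maps to the OOV index.
print(layer(tf.constant([[42, 1729, 203]], dtype=tf.int64)))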
tensorflow/python/keras/layers/preprocessing/integer_lookup_test.py

@@ -426,6 +426,21 @@ class IntegerLookupVocabularyTest(
    output_dataset = model.predict(input_array)
    self.assertAllEqual(expected_output, output_dataset)

  def test_int_output_explicit_vocab_from_file_via_setter(self):
    vocab_list = [42, 1138, 725, 1729]
    vocab_path = self._write_to_temp_file("vocab_file", vocab_list)

    input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
    expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]

    input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
    layer = get_layer_class()()
    layer.set_vocabulary(vocab_path)
    int_data = layer(input_data)
    model = keras.Model(inputs=input_data, outputs=int_data)
    output_dataset = model.predict(input_array)
    self.assertAllEqual(expected_output, output_dataset)

  def test_non_unique_vocab_fails(self):
    vocab_data = [42, 1138, 725, 1729, 1729]
    with self.assertRaisesRegex(ValueError, ".*repeated term.*1729.*"):

tensorflow/python/keras/layers/preprocessing/string_lookup.py

@@ -212,3 +212,8 @@ class StringLookup(index_lookup.IndexLookup):
    # This is required because the MutableHashTable doesn't preserve insertion
    # order, but we rely on the order of the array to assign indices.
    return [x.decode(self.encoding) for _, x in sorted(zip(values, keys))]

  def set_vocabulary(self, vocab):
    if isinstance(vocab, str):
      vocab = table_utils.get_vocabulary_from_file(vocab, self.encoding)
    super().set_vocabulary(vocab)

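Similarly, a sketch for the `StringLookup` override; note it forwards the layer's `encoding` to the file reader. Namespace and file contents are assumptions as above:

import tensorflow as tf

# Hypothetical vocab file: one UTF-8 token per line.
with open("/tmp/str_vocab.txt", "w", encoding="utf-8") as f:
  f.write("earth\nwind\nand\nfire\n")

layer = tf.keras.layers.experimental.preprocessing.StringLookup()
# The file is decoded with the layer's `encoding` before the tokens are set.
layer.set_vocabulary("/tmp/str_vocab.txt")

print(layer.get_vocabulary())
print(layer(tf.constant([["fire", "and", "michigan"]])))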
tensorflow/python/keras/layers/preprocessing/string_lookup_test.py

@@ -177,6 +177,22 @@ class StringLookupVocabularyTest(keras_parameterized.TestCase,
    output_dataset = model.predict(input_array)
    self.assertAllEqual(expected_output, output_dataset)

  def test_int_output_explicit_vocab_from_file_via_setter(self):
    vocab_list = ["earth", "wind", "and", "fire"]
    vocab_path = self._write_to_temp_file("vocab_file", vocab_list)

    input_array = np.array([["earth", "wind", "and", "fire"],
                            ["fire", "and", "earth", "michigan"]])
    expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]

    input_data = keras.Input(shape=(None,), dtype=dtypes.string)
    layer = get_layer_class()()
    layer.set_vocabulary(vocab_path)
    int_data = layer(input_data)
    model = keras.Model(inputs=input_data, outputs=int_data)
    output_dataset = model.predict(input_array)
    self.assertAllEqual(expected_output, output_dataset)

  def test_non_unique_vocab_fails(self):
    vocab_data = ["earth", "wind", "and", "fire", "fire"]
    with self.assertRaisesRegex(ValueError, ".*repeated term.*fire.*"):

tensorflow/python/keras/layers/preprocessing/text_vectorization.py

@@ -28,6 +28,7 @@ from tensorflow.python.keras import backend as K
from tensorflow.python.keras.engine import base_preprocessing_layer
from tensorflow.python.keras.layers.preprocessing import category_encoding
from tensorflow.python.keras.layers.preprocessing import string_lookup
from tensorflow.python.keras.layers.preprocessing import table_utils
from tensorflow.python.keras.utils import layer_utils
from tensorflow.python.keras.utils import tf_utils
from tensorflow.python.ops import array_ops
@@ -481,7 +482,8 @@ class TextVectorization(base_preprocessing_layer.CombinerPreprocessingLayer):
    it.

    Arguments:
-      vocab: An array of string tokens.
+      vocab: An array of string tokens, or a path to a file containing one
+        token per line.
      df_data: An array of document frequency data. Only necessary if the layer
        output_mode is TFIDF.
      oov_df_value: The document frequency of the OOV token. Only necessary if
@@ -506,6 +508,21 @@ class TextVectorization(base_preprocessing_layer.CombinerPreprocessingLayer):
                          "be changed after the layer is "
                          "called.").format(mode=self._output_mode))

    # Handle reading from a file. We can't do this via TF-IDF, as we don't have
    # a standard format - we error out and ask our users to parse the file
    # themselves.
    if isinstance(vocab, str):
      if self._output_mode == TFIDF:
        raise RuntimeError("Setting vocabulary directly from a file is not "
                           "supported in TF-IDF mode, since this layer cannot "
                           "read files containing TF-IDF weight data. Please "
                           "read the file using Python and set the vocab "
                           "and weights by passing lists or arrays to the "
                           "set_vocabulary function's `vocab` and `df_data` "
                           "args.")
      vocab = table_utils.get_vocabulary_from_file(
          vocab, self._index_lookup_layer.encoding)

    self._index_lookup_layer.set_vocabulary(vocab)

    # When doing raw or integer output, we don't have a Vectorize layer to

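Because the hunk above rejects file paths in TF-IDF mode, here is a user-side sketch of what its error message asks for: parse the weights file yourself and pass lists to `set_vocabulary`. The tab-separated file format, path, and OOV value are assumptions; the layer defines no standard format:

import tensorflow as tf

# Hypothetical weights file: "token<TAB>document_frequency" per line.
with open("/tmp/tfidf_vocab.tsv", "w") as f:
  f.write("earth\t0.8\nwind\t0.4\nand\t0.9\nfire\t0.1\n")

vocab, df_data = [], []
with open("/tmp/tfidf_vocab.tsv") as f:
  for line in f:
    token, df = line.rstrip("\n").split("\t")
    vocab.append(token)
    df_data.append(float(df))

layer = tf.keras.layers.experimental.preprocessing.TextVectorization(
    standardize=None, split=None, output_mode="tf-idf")
# OOV document frequency chosen arbitrarily for this sketch.
layer.set_vocabulary(vocab, df_data=df_data, oov_df_value=1.0)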
tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py

@@ -44,6 +44,7 @@ from tensorflow.python.keras.utils import generic_utils
from tensorflow.python.ops import gen_string_ops
from tensorflow.python.ops.ragged import ragged_factory_ops
from tensorflow.python.ops.ragged import ragged_string_ops
from tensorflow.python.platform import gfile
from tensorflow.python.platform import test
@@ -414,6 +415,15 @@ class TextVectorizationPreprocessingTest(
    keras_parameterized.TestCase,
    preprocessing_test_utils.PreprocessingLayerTest):

  def _write_to_temp_file(self, file_name, vocab_list):
    vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt")
    with gfile.GFile(vocab_path, "w") as writer:
      for vocab in vocab_list:
        writer.write(vocab + "\n")
      writer.flush()
      writer.close()
    return vocab_path

  def test_summary_before_adapt(self):
    input_data = keras.Input(shape=(None,), dtype=dtypes.string)
    layer = get_layer_class()(
@@ -709,6 +719,46 @@ class TextVectorizationPreprocessingTest(
    output_dataset = model.predict(input_array)
    self.assertAllEqual(expected_output, output_dataset)

  def test_vocab_setting_via_init_file(self):
    vocab_data = ["earth", "wind", "and", "fire"]
    input_array = np.array([["earth", "wind", "and", "fire"],
                            ["fire", "and", "earth", "michigan"]])
    expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]

    vocab_path = self._write_to_temp_file("vocab_file", vocab_data)
    input_data = keras.Input(shape=(None,), dtype=dtypes.string)
    layer = get_layer_class()(
        max_tokens=None,
        standardize=None,
        split=None,
        output_mode=text_vectorization.INT,
        vocabulary=vocab_path)
    int_data = layer(input_data)
    model = keras.Model(inputs=input_data, outputs=int_data)

    output_dataset = model.predict(input_array)
    self.assertAllEqual(expected_output, output_dataset)

  def test_vocab_setting_via_setter(self):
    vocab_data = ["earth", "wind", "and", "fire"]
    input_array = np.array([["earth", "wind", "and", "fire"],
                            ["fire", "and", "earth", "michigan"]])
    expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]

    vocab_path = self._write_to_temp_file("vocab_file", vocab_data)
    input_data = keras.Input(shape=(None,), dtype=dtypes.string)
    layer = get_layer_class()(
        max_tokens=None,
        standardize=None,
        split=None,
        output_mode=text_vectorization.INT)
    layer.set_vocabulary(vocab_path)
    int_data = layer(input_data)
    model = keras.Model(inputs=input_data, outputs=int_data)

    output_dataset = model.predict(input_array)
    self.assertAllEqual(expected_output, output_dataset)

@keras_parameterized.run_all_keras_modes
class TextVectorizationDistributionTest(