From 0685f70521018a653f22530c900a1253d4fa3963 Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Thu, 20 Feb 2020 16:11:36 -0800 Subject: [PATCH] Automated rollback of commit 0d2f3be5ebe4c762dddad2fe1bac1b4af538de2c PiperOrigin-RevId: 296320816 Change-Id: Ib8b5857178fa10513755de65ffcde1adf6dabad3 --- .../python/keras/layers/preprocessing/BUILD | 6 +-- .../layers/preprocessing/index_lookup.py | 46 +++++++++++++++--- .../layers/preprocessing/index_lookup_test.py | 47 +++++++++++++++++-- 3 files changed, 85 insertions(+), 14 deletions(-) diff --git a/tensorflow/python/keras/layers/preprocessing/BUILD b/tensorflow/python/keras/layers/preprocessing/BUILD index 720e92483fb..e0dd9114755 100644 --- a/tensorflow/python/keras/layers/preprocessing/BUILD +++ b/tensorflow/python/keras/layers/preprocessing/BUILD @@ -303,10 +303,9 @@ cuda_py_test( ) tf_py_test( - name = "preprocessing_normalization_test", + name = "normalization_test", size = "small", srcs = ["normalization_test.py"], - main = "normalization_test.py", python_version = "PY3", deps = [ ":normalization", @@ -317,10 +316,9 @@ tf_py_test( ) tf_py_test( - name = "preprocessing_text_vectorization_test", + name = "text_vectorization_test", size = "medium", srcs = ["text_vectorization_test.py"], - main = "text_vectorization_test.py", python_version = "PY3", deps = [ ":preprocessing_test_utils", diff --git a/tensorflow/python/keras/layers/preprocessing/index_lookup.py b/tensorflow/python/keras/layers/preprocessing/index_lookup.py index 7bd7f6683d1..e8c2c0aefc6 100644 --- a/tensorflow/python/keras/layers/preprocessing/index_lookup.py +++ b/tensorflow/python/keras/layers/preprocessing/index_lookup.py @@ -32,6 +32,7 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import lookup_ops from tensorflow.python.ops.ragged import ragged_functional_ops from tensorflow.python.ops.ragged import ragged_tensor +from tensorflow.python.platform import gfile from tensorflow.python.util import compat # The string tokens in the extracted vocabulary @@ -66,7 +67,13 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): 1. If this value is more than 1, OOV inputs are hashed to determine their OOV value; if this value is 0, passing an OOV input will result in a runtime error. - vocabulary: An optional list of vocabulary terms. + vocabulary: An optional list of vocabulary terms, or a path to a text file + containing a vocabulary to load into this layer. The file should contain + one token per line. In either case, the vocabulary must be unique; if + the list or file contains the same token multiple times, an error will + be thrown. Note that when passing a vocabulary - either as a list or as + a file - the vocabulary will not be present in the layer's config dict; + it will instead be a part of the layer's weights. reserve_zero: Whether to reserve the index 0, which indicates pad values in the Keras masking system. If True, the output of this layer will be in the range `[1...max_tokens+1)`; if False, the output will be in the range @@ -164,10 +171,38 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): self._inverse_table = None if vocabulary is not None: - self._export_vocab = True + if isinstance(vocabulary, str): + vocabulary = self._get_vocabulary_from_file(vocabulary) + + vocabulary_set = set(vocabulary) + if len(vocabulary) != len(vocabulary_set): + repeated_items = [ + item for item, count in collections.Counter(vocabulary).items() + if count > 1 + ] + raise ValueError("The passed vocabulary has at least one repeated " + "term. Please uniquify your dataset before passing " + "it to IndexLookup(). The repeated terms are %s" % + repeated_items) self.set_vocabulary(vocabulary) - else: - self._export_vocab = False + + def _get_vocabulary_from_file(self, vocabulary_path): + vocab = [] + with gfile.GFile(vocabulary_path, "r") as reader: + while True: + # Get the next line, and break if it is None. + text = reader.readline() + if not text: + break + + # Convert the raw text into UTF8 and strip whitespace. + if isinstance(text, str): + token = text + elif isinstance(text, bytes): + token = text.decode("utf-8", "ignore") + token = token.strip() + vocab.append(token) + return vocab def _get_table_data(self): keys, values = self._table.export() @@ -256,11 +291,10 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): return [x for _, x in sorted(zip(values, keys))] def get_config(self): - vocabulary = self.get_vocabulary() if self._export_vocab else None config = { "max_tokens": self.max_tokens, "num_oov_tokens": self.num_oov_tokens, - "vocabulary": vocabulary, + "vocabulary": None, "reserve_zero": self.reserve_zero, "mask_zero": self.mask_zero, } diff --git a/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py b/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py index fbb6062ce0b..96a7e7c547f 100644 --- a/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py +++ b/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py @@ -38,6 +38,7 @@ from tensorflow.python.keras.layers.preprocessing import preprocessing_test_util from tensorflow.python.keras.saving import save from tensorflow.python.keras.utils.generic_utils import CustomObjectScope from tensorflow.python.ops.ragged import ragged_factory_ops +from tensorflow.python.platform import gfile from tensorflow.python.platform import test @@ -356,7 +357,22 @@ class IndexLookupOutputTest(keras_parameterized.TestCase, output_dataset = model.predict(input_array) self.assertAllEqual(expected_output, output_dataset) - def test_int_output_explicit_vocab_from_config(self): + +@keras_parameterized.run_all_keras_modes +class IndexLookupVocabularyTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest + ): + + def _write_to_temp_file(self, file_name, vocab_list): + vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt") + with gfile.GFile(vocab_path, "w") as writer: + for vocab in vocab_list: + writer.write(vocab + "\n") + writer.flush() + writer.close() + return vocab_path + + def test_int_output_explicit_vocab(self): vocab_data = ["earth", "wind", "and", "fire"] input_array = np.array([["earth", "wind", "and", "fire"], ["fire", "and", "earth", "michigan"]]) @@ -366,10 +382,22 @@ class IndexLookupOutputTest(keras_parameterized.TestCase, layer = get_layer_class()(vocabulary=vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) - with CustomObjectScope({"IndexLookup": get_layer_class()}): - new_model = keras.Model.from_config(model.get_config()) - output_dataset = new_model.predict(input_array) + def test_int_output_explicit_vocab_from_file(self): + vocab_list = ["earth", "wind", "and", "fire"] + vocab_path = self._write_to_temp_file("vocab_file", vocab_list) + + input_array = np.array([["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"]]) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.string) + layer = get_layer_class()(vocabulary=vocab_path) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) self.assertAllEqual(expected_output, output_dataset) def test_vocab_appending(self): @@ -387,6 +415,17 @@ class IndexLookupOutputTest(keras_parameterized.TestCase, output_dataset = model.predict(input_array) self.assertAllClose(expected_output, output_dataset) + def test_non_unique_vocab_fails(self): + vocab_data = ["earth", "wind", "and", "fire", "fire"] + with self.assertRaisesRegex(ValueError, ".*repeated term.*fire.*"): + _ = get_layer_class()(vocabulary=vocab_data) + + def test_non_unique_vocab_from_file_fails(self): + vocab_list = ["earth", "wind", "and", "fire", "earth"] + vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list) + with self.assertRaisesRegex(ValueError, ".*repeated term.*earth.*"): + _ = get_layer_class()(vocabulary=vocab_path) + @keras_parameterized.run_all_keras_modes class InverseLookupOutputTest(keras_parameterized.TestCase,