Add options to lookup layers to handle vocabularies from files.

PiperOrigin-RevId: 338487147
Change-Id: I1274f5e300bfdc5b1e9d338c2946dc2ad74b76d7
A. Unique TensorFlower 2020-10-22 09:23:52 -07:00, committed by TensorFlower Gardener
parent 4fd222fe53
commit 270db009cf
7 changed files with 112 additions and 1 deletion

RELEASE.md

@@ -293,6 +293,8 @@ This release contains contributions from many people at Google, as well as:
* Improvements to Keras preprocessing layers:
  * TextVectorization can now accept a vocabulary list or file as an
    init arg.
  * TextVectorization, StringLookup, and IntegerLookup can now accept a
    vocabulary file via the `set_vocabulary` method.
  * Normalization can now accept mean and variance values as init args.
* In `Attention` and `AdditiveAttention` layers, the `call()` method now
  accepts a `return_attention_scores` argument. When set to

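For illustration, a minimal sketch of the release-note items above, assuming the TF 2.4-era `tf.keras.layers.experimental.preprocessing` namespace; the vocabulary file path and contents are hypothetical, not code from this commit:

import tensorflow as tf

# Hypothetical vocabulary file: one token per line.
with open("/tmp/vocab.txt", "w") as f:
  f.write("earth\nwind\nand\nfire\n")

# New in this release: a vocabulary file path can be passed directly as
# an init arg.
layer = tf.keras.layers.experimental.preprocessing.TextVectorization(
    standardize=None, split=None, vocabulary="/tmp/vocab.txt")

# Indices 0 and 1 are reserved (padding and OOV), so "earth" maps to 2
# and the unknown token "michigan" maps to 1.
print(layer(tf.constant([["earth"], ["michigan"]])))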
tensorflow/python/keras/layers/preprocessing/integer_lookup.py

@@ -217,3 +217,9 @@ class IntegerLookup(index_lookup.IndexLookup):
    base_config["oov_value"] = base_config["oov_token"]
    del base_config["oov_token"]
    return base_config

  def set_vocabulary(self, vocab):
    if isinstance(vocab, str):
      vocab = table_utils.get_vocabulary_from_file(vocab)
      vocab = [int(v) for v in vocab]
    super().set_vocabulary(vocab)

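A usage sketch of the `IntegerLookup.set_vocabulary` override above, under the same TF 2.4-era namespace assumption; the file path and contents are hypothetical:

import tensorflow as tf

# Hypothetical vocab file: one integer token per line.
with open("/tmp/int_vocab.txt", "w") as f:
  f.write("42\n1138\n725\n1729\n")

layer = tf.keras.layers.experimental.preprocessing.IntegerLookup()
# The override reads the file and casts each line to int before delegating
# to the base class setter.
layer.set_vocabulary("/tmp/int_vocab.txt")

# 203 is not in the vocab, so it maps to the OOV index.
print(layer(tf.constant([[42, 1729, 203]], dtype=tf.int64)))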
tensorflow/python/keras/layers/preprocessing/integer_lookup_test.py

@@ -426,6 +426,21 @@ class IntegerLookupVocabularyTest(
    output_dataset = model.predict(input_array)
    self.assertAllEqual(expected_output, output_dataset)

  def test_int_output_explicit_vocab_from_file_via_setter(self):
    vocab_list = [42, 1138, 725, 1729]
    vocab_path = self._write_to_temp_file("vocab_file", vocab_list)

    input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
    expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]

    input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
    layer = get_layer_class()()
    layer.set_vocabulary(vocab_path)
    int_data = layer(input_data)
    model = keras.Model(inputs=input_data, outputs=int_data)
    output_dataset = model.predict(input_array)
    self.assertAllEqual(expected_output, output_dataset)

  def test_non_unique_vocab_fails(self):
    vocab_data = [42, 1138, 725, 1729, 1729]
    with self.assertRaisesRegex(ValueError, ".*repeated term.*1729.*"):

tensorflow/python/keras/layers/preprocessing/string_lookup.py

@@ -212,3 +212,8 @@ class StringLookup(index_lookup.IndexLookup):
    # This is required because the MutableHashTable doesn't preserve insertion
    # order, but we rely on the order of the array to assign indices.
    return [x.decode(self.encoding) for _, x in sorted(zip(values, keys))]

  def set_vocabulary(self, vocab):
    if isinstance(vocab, str):
      vocab = table_utils.get_vocabulary_from_file(vocab, self.encoding)
    super().set_vocabulary(vocab)

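Similarly, a sketch for the `StringLookup` override; note it forwards the layer's `encoding` to the file reader. Namespace and file contents are assumptions as above:

import tensorflow as tf

# Hypothetical vocab file: one UTF-8 token per line.
with open("/tmp/str_vocab.txt", "w", encoding="utf-8") as f:
  f.write("earth\nwind\nand\nfire\n")

layer = tf.keras.layers.experimental.preprocessing.StringLookup()
# The file is decoded with the layer's `encoding` before the tokens are set.
layer.set_vocabulary("/tmp/str_vocab.txt")

print(layer.get_vocabulary())
print(layer(tf.constant([["fire", "and", "michigan"]])))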
tensorflow/python/keras/layers/preprocessing/string_lookup_test.py

@@ -177,6 +177,22 @@ class StringLookupVocabularyTest(keras_parameterized.TestCase,
    output_dataset = model.predict(input_array)
    self.assertAllEqual(expected_output, output_dataset)

  def test_int_output_explicit_vocab_from_file_via_setter(self):
    vocab_list = ["earth", "wind", "and", "fire"]
    vocab_path = self._write_to_temp_file("vocab_file", vocab_list)

    input_array = np.array([["earth", "wind", "and", "fire"],
                            ["fire", "and", "earth", "michigan"]])
    expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]

    input_data = keras.Input(shape=(None,), dtype=dtypes.string)
    layer = get_layer_class()()
    layer.set_vocabulary(vocab_path)
    int_data = layer(input_data)
    model = keras.Model(inputs=input_data, outputs=int_data)
    output_dataset = model.predict(input_array)
    self.assertAllEqual(expected_output, output_dataset)

  def test_non_unique_vocab_fails(self):
    vocab_data = ["earth", "wind", "and", "fire", "fire"]
    with self.assertRaisesRegex(ValueError, ".*repeated term.*fire.*"):

tensorflow/python/keras/layers/preprocessing/text_vectorization.py

@@ -28,6 +28,7 @@ from tensorflow.python.keras import backend as K
from tensorflow.python.keras.engine import base_preprocessing_layer
from tensorflow.python.keras.layers.preprocessing import category_encoding
from tensorflow.python.keras.layers.preprocessing import string_lookup
from tensorflow.python.keras.layers.preprocessing import table_utils
from tensorflow.python.keras.utils import layer_utils
from tensorflow.python.keras.utils import tf_utils
from tensorflow.python.ops import array_ops
@@ -481,7 +482,8 @@ class TextVectorization(base_preprocessing_layer.CombinerPreprocessingLayer):
    it.

    Arguments:
-      vocab: An array of string tokens.
+      vocab: An array of string tokens, or a path to a file containing one
+        token per line.
      df_data: An array of document frequency data. Only necessary if the layer
        output_mode is TFIDF.
      oov_df_value: The document frequency of the OOV token. Only necessary if
@@ -506,6 +508,21 @@ class TextVectorization(base_preprocessing_layer.CombinerPreprocessingLayer):
                          "be changed after the layer is "
                          "called.").format(mode=self._output_mode))

    # Handle reading from a file. We can't do this via TF-IDF, as we don't have
    # a standard format - we error out and ask our users to parse the file
    # themselves.
    if isinstance(vocab, str):
      if self._output_mode == TFIDF:
        raise RuntimeError("Setting vocabulary directly from a file is not "
                           "supported in TF-IDF mode, since this layer cannot "
                           "read files containing TF-IDF weight data. Please "
                           "read the file using Python and set the vocab "
                           "and weights by passing lists or arrays to the "
                           "set_vocabulary function's `vocab` and `df_data` "
                           "args.")
      vocab = table_utils.get_vocabulary_from_file(
          vocab, self._index_lookup_layer.encoding)

    self._index_lookup_layer.set_vocabulary(vocab)

    # When doing raw or integer output, we don't have a Vectorize layer to

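Because the hunk above rejects file paths in TF-IDF mode, here is a user-side sketch of what its error message asks for: parse the weights file yourself and pass lists to `set_vocabulary`. The tab-separated file format, path, and OOV value are assumptions; the layer defines no standard format:

import tensorflow as tf

# Hypothetical weights file: "token<TAB>document_frequency" per line.
with open("/tmp/tfidf_vocab.tsv", "w") as f:
  f.write("earth\t0.8\nwind\t0.4\nand\t0.9\nfire\t0.1\n")

vocab, df_data = [], []
with open("/tmp/tfidf_vocab.tsv") as f:
  for line in f:
    token, df = line.rstrip("\n").split("\t")
    vocab.append(token)
    df_data.append(float(df))

layer = tf.keras.layers.experimental.preprocessing.TextVectorization(
    standardize=None, split=None, output_mode="tf-idf")
# OOV document frequency chosen arbitrarily for this sketch.
layer.set_vocabulary(vocab, df_data=df_data, oov_df_value=1.0)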
tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py

@@ -44,6 +44,7 @@ from tensorflow.python.keras.utils import generic_utils
from tensorflow.python.ops import gen_string_ops
from tensorflow.python.ops.ragged import ragged_factory_ops
from tensorflow.python.ops.ragged import ragged_string_ops
from tensorflow.python.platform import gfile
from tensorflow.python.platform import test
@@ -414,6 +415,15 @@ class TextVectorizationPreprocessingTest(
    keras_parameterized.TestCase,
    preprocessing_test_utils.PreprocessingLayerTest):

  def _write_to_temp_file(self, file_name, vocab_list):
    vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt")
    with gfile.GFile(vocab_path, "w") as writer:
      for vocab in vocab_list:
        writer.write(vocab + "\n")
      writer.flush()
      writer.close()
    return vocab_path

  def test_summary_before_adapt(self):
    input_data = keras.Input(shape=(None,), dtype=dtypes.string)
    layer = get_layer_class()(
@@ -709,6 +719,46 @@ class TextVectorizationPreprocessingTest(
    output_dataset = model.predict(input_array)
    self.assertAllEqual(expected_output, output_dataset)

  def test_vocab_setting_via_init_file(self):
    vocab_data = ["earth", "wind", "and", "fire"]
    input_array = np.array([["earth", "wind", "and", "fire"],
                            ["fire", "and", "earth", "michigan"]])
    expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]

    vocab_path = self._write_to_temp_file("vocab_file", vocab_data)
    input_data = keras.Input(shape=(None,), dtype=dtypes.string)
    layer = get_layer_class()(
        max_tokens=None,
        standardize=None,
        split=None,
        output_mode=text_vectorization.INT,
        vocabulary=vocab_path)
    int_data = layer(input_data)
    model = keras.Model(inputs=input_data, outputs=int_data)

    output_dataset = model.predict(input_array)
    self.assertAllEqual(expected_output, output_dataset)

  def test_vocab_setting_via_setter(self):
    vocab_data = ["earth", "wind", "and", "fire"]
    input_array = np.array([["earth", "wind", "and", "fire"],
                            ["fire", "and", "earth", "michigan"]])
    expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]

    vocab_path = self._write_to_temp_file("vocab_file", vocab_data)
    input_data = keras.Input(shape=(None,), dtype=dtypes.string)
    layer = get_layer_class()(
        max_tokens=None,
        standardize=None,
        split=None,
        output_mode=text_vectorization.INT)
    layer.set_vocabulary(vocab_path)
    int_data = layer(input_data)
    model = keras.Model(inputs=input_data, outputs=int_data)

    output_dataset = model.predict(input_array)
    self.assertAllEqual(expected_output, output_dataset)

@keras_parameterized.run_all_keras_modes
class TextVectorizationDistributionTest(