Add options to lookup layers to handle vocabularies from files.
PiperOrigin-RevId: 338487147
Change-Id: I1274f5e300bfdc5b1e9d338c2946dc2ad74b76d7
commit 270db009cf (parent 4fd222fe53)
Files changed:
  RELEASE.md
  tensorflow/python/keras/layers/preprocessing
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -293,6 +293,8 @@ This release contains contributions from many people at Google, as well as:
 *   Improvements to Keras preprocessing layers:
+    *   TextVectorization can now accept a vocabulary list or file as an
+        init arg.
     *   TextVectorization, StringLookup, and IntegerLookup can now accept a
         vocabulary file via the `set_vocab_from_file` method.
     *   Normalization can now accept mean and variance values as init args.
     *   In `Attention` and `AdditiveAttention` layers, the `call()` method now
         accepts a `return_attention_scores` argument. When set to
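
In practice, the release notes above boil down to two new entry points. A minimal sketch, assuming the TF 2.4-era export path (`tf.keras.layers.experimental.preprocessing`) and a plain-text vocabulary file with one token per line; the file path and tokens here are illustrative only:

import tensorflow as tf

# Hypothetical vocabulary file: one token per line.
vocab_path = "/tmp/vocab.txt"
with open(vocab_path, "w") as f:
  f.write("earth\nwind\nand\nfire\n")

# 1) Pass the vocabulary file as an init arg.
layer = tf.keras.layers.experimental.preprocessing.TextVectorization(
    standardize=None, split=None, vocabulary=vocab_path)

# 2) Or set it after construction via the setter added in this commit.
layer = tf.keras.layers.experimental.preprocessing.TextVectorization(
    standardize=None, split=None)
layer.set_vocabulary(vocab_path)

# Tokens map to indices after the reserved padding/OOV slots; an unknown
# token such as "michigan" maps to the OOV index.
print(layer(tf.constant([["earth", "wind", "michigan"]])))
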
--- a/tensorflow/python/keras/layers/preprocessing/integer_lookup.py
+++ b/tensorflow/python/keras/layers/preprocessing/integer_lookup.py
@@ -217,3 +217,9 @@ class IntegerLookup(index_lookup.IndexLookup):
     base_config["oov_value"] = base_config["oov_token"]
     del base_config["oov_token"]
     return base_config
+
+  def set_vocabulary(self, vocab):
+    if isinstance(vocab, str):
+      vocab = table_utils.get_vocabulary_from_file(vocab)
+      vocab = [int(v) for v in vocab]
+    super().set_vocabulary(vocab)
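
A usage sketch for the new IntegerLookup setter, again assuming the TF 2.4-era export path and an illustrative temp file; note how the override above casts the string lines read from the file back to ints:

import tensorflow as tf

# Hypothetical file of integer tokens, one per line.
with open("/tmp/int_vocab.txt", "w") as f:
  f.write("42\n1138\n725\n1729\n")

layer = tf.keras.layers.experimental.preprocessing.IntegerLookup()
layer.set_vocabulary("/tmp/int_vocab.txt")

# 42 and 1729 are in the vocab; 203 falls back to the OOV index.
print(layer(tf.constant([[42, 1729, 203]], dtype=tf.int64)))
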
--- a/tensorflow/python/keras/layers/preprocessing/integer_lookup_test.py
+++ b/tensorflow/python/keras/layers/preprocessing/integer_lookup_test.py
@@ -426,6 +426,21 @@ class IntegerLookupVocabularyTest(
     output_dataset = model.predict(input_array)
     self.assertAllEqual(expected_output, output_dataset)
 
+  def test_int_output_explicit_vocab_from_file_via_setter(self):
+    vocab_list = [42, 1138, 725, 1729]
+    vocab_path = self._write_to_temp_file("vocab_file", vocab_list)
+
+    input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
+    expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
+
+    input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
+    layer = get_layer_class()()
+    layer.set_vocabulary(vocab_path)
+    int_data = layer(input_data)
+    model = keras.Model(inputs=input_data, outputs=int_data)
+    output_dataset = model.predict(input_array)
+    self.assertAllEqual(expected_output, output_dataset)
+
   def test_non_unique_vocab_fails(self):
     vocab_data = [42, 1138, 725, 1729, 1729]
     with self.assertRaisesRegex(ValueError, ".*repeated term.*1729.*"):
--- a/tensorflow/python/keras/layers/preprocessing/string_lookup.py
+++ b/tensorflow/python/keras/layers/preprocessing/string_lookup.py
@@ -212,3 +212,8 @@ class StringLookup(index_lookup.IndexLookup):
     # This is required because the MutableHashTable doesn't preserve insertion
     # order, but we rely on the order of the array to assign indices.
     return [x.decode(self.encoding) for _, x in sorted(zip(values, keys))]
+
+  def set_vocabulary(self, vocab):
+    if isinstance(vocab, str):
+      vocab = table_utils.get_vocabulary_from_file(vocab, self.encoding)
+    super().set_vocabulary(vocab)
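
The StringLookup setter works the same way, except the file is decoded with the layer's configured encoding. A minimal sketch under the same assumptions as above:

import tensorflow as tf

# Hypothetical vocabulary file, one token per line.
with open("/tmp/str_vocab.txt", "w") as f:
  f.write("earth\nwind\nand\nfire\n")

layer = tf.keras.layers.experimental.preprocessing.StringLookup()
layer.set_vocabulary("/tmp/str_vocab.txt")

# In-vocab tokens get indices after the reserved slots; "michigan" is OOV.
print(layer(tf.constant([["earth", "fire", "michigan"]])))
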
--- a/tensorflow/python/keras/layers/preprocessing/string_lookup_test.py
+++ b/tensorflow/python/keras/layers/preprocessing/string_lookup_test.py
@@ -177,6 +177,22 @@ class StringLookupVocabularyTest(keras_parameterized.TestCase,
     output_dataset = model.predict(input_array)
     self.assertAllEqual(expected_output, output_dataset)
 
+  def test_int_output_explicit_vocab_from_file_via_setter(self):
+    vocab_list = ["earth", "wind", "and", "fire"]
+    vocab_path = self._write_to_temp_file("vocab_file", vocab_list)
+
+    input_array = np.array([["earth", "wind", "and", "fire"],
+                            ["fire", "and", "earth", "michigan"]])
+    expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
+
+    input_data = keras.Input(shape=(None,), dtype=dtypes.string)
+    layer = get_layer_class()()
+    layer.set_vocabulary(vocab_path)
+    int_data = layer(input_data)
+    model = keras.Model(inputs=input_data, outputs=int_data)
+    output_dataset = model.predict(input_array)
+    self.assertAllEqual(expected_output, output_dataset)
+
   def test_non_unique_vocab_fails(self):
     vocab_data = ["earth", "wind", "and", "fire", "fire"]
     with self.assertRaisesRegex(ValueError, ".*repeated term.*fire.*"):
--- a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py
+++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py
@@ -28,6 +28,7 @@ from tensorflow.python.keras import backend as K
 from tensorflow.python.keras.engine import base_preprocessing_layer
 from tensorflow.python.keras.layers.preprocessing import category_encoding
 from tensorflow.python.keras.layers.preprocessing import string_lookup
+from tensorflow.python.keras.layers.preprocessing import table_utils
 from tensorflow.python.keras.utils import layer_utils
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
@@ -481,7 +482,8 @@ class TextVectorization(base_preprocessing_layer.CombinerPreprocessingLayer):
       it.
 
     Arguments:
-      vocab: An array of string tokens.
+      vocab: An array of string tokens, or a path to a file containing one
+        token per line.
       df_data: An array of document frequency data. Only necessary if the layer
         output_mode is TFIDF.
      oov_df_value: The document frequency of the OOV token. Only necessary if
@@ -506,6 +508,21 @@ class TextVectorization(base_preprocessing_layer.CombinerPreprocessingLayer):
                          "be changed after the layer is "
                          "called.").format(mode=self._output_mode))
 
+    # Handle reading from a file. We can't do this via TF-IDF, as we don't have
+    # a standard format - we error out and ask our users to parse the file
+    # themselves.
+    if isinstance(vocab, str):
+      if self._output_mode == TFIDF:
+        raise RuntimeError("Setting vocabulary directly from a file is not "
+                           "supported in TF-IDF mode, since this layer cannot "
+                           "read files containing TF-IDF weight data. Please "
+                           "read the file using Python and set the vocab "
+                           "and weights by passing lists or arrays to the "
+                           "set_vocabulary function's `vocab` and `df_data` "
+                           "args.")
+      vocab = table_utils.get_vocabulary_from_file(
+          vocab, self._index_lookup_layer.encoding)
+
     self._index_lookup_layer.set_vocabulary(vocab)
 
     # When doing raw or integer output, we don't have a Vectorize layer to
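
The hunk above leans on `table_utils.get_vocabulary_from_file`, whose body is not part of this diff. A hypothetical re-sketch of the contract its callers imply (one token per line, bytes decoded with the layer's encoding); the real helper in table_utils may differ:

from tensorflow.python.platform import gfile


def get_vocabulary_from_file(vocab_path, encoding="utf-8"):
  # Hypothetical sketch: read one token per line, decoding bytes with
  # `encoding` and stripping the trailing newline.
  vocab = []
  with gfile.GFile(vocab_path, "r") as reader:
    for line in reader:
      if isinstance(line, bytes):
        line = line.decode(encoding)
      vocab.append(line.rstrip("\n"))
  return vocab

Keeping file parsing in one helper lets StringLookup, IntegerLookup, and TextVectorization share the same one-token-per-line format, while TF-IDF mode opts out because per-token weights cannot be encoded in that format.
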
--- a/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py
+++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py
@@ -44,6 +44,7 @@ from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.ops import gen_string_ops
 from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_string_ops
+from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 
 
@@ -414,6 +415,15 @@ class TextVectorizationPreprocessingTest(
     keras_parameterized.TestCase,
     preprocessing_test_utils.PreprocessingLayerTest):
 
+  def _write_to_temp_file(self, file_name, vocab_list):
+    vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt")
+    with gfile.GFile(vocab_path, "w") as writer:
+      for vocab in vocab_list:
+        writer.write(vocab + "\n")
+      writer.flush()
+      writer.close()
+    return vocab_path
+
   def test_summary_before_adapt(self):
     input_data = keras.Input(shape=(None,), dtype=dtypes.string)
     layer = get_layer_class()(
@@ -709,6 +719,46 @@ class TextVectorizationPreprocessingTest(
     output_dataset = model.predict(input_array)
     self.assertAllEqual(expected_output, output_dataset)
 
+  def test_vocab_setting_via_init_file(self):
+    vocab_data = ["earth", "wind", "and", "fire"]
+    input_array = np.array([["earth", "wind", "and", "fire"],
+                            ["fire", "and", "earth", "michigan"]])
+    expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
+
+    vocab_path = self._write_to_temp_file("vocab_file", vocab_data)
+    input_data = keras.Input(shape=(None,), dtype=dtypes.string)
+    layer = get_layer_class()(
+        max_tokens=None,
+        standardize=None,
+        split=None,
+        output_mode=text_vectorization.INT,
+        vocabulary=vocab_path)
+    int_data = layer(input_data)
+    model = keras.Model(inputs=input_data, outputs=int_data)
+
+    output_dataset = model.predict(input_array)
+    self.assertAllEqual(expected_output, output_dataset)
+
+  def test_vocab_setting_via_setter(self):
+    vocab_data = ["earth", "wind", "and", "fire"]
+    input_array = np.array([["earth", "wind", "and", "fire"],
+                            ["fire", "and", "earth", "michigan"]])
+    expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
+
+    vocab_path = self._write_to_temp_file("vocab_file", vocab_data)
+    input_data = keras.Input(shape=(None,), dtype=dtypes.string)
+    layer = get_layer_class()(
+        max_tokens=None,
+        standardize=None,
+        split=None,
+        output_mode=text_vectorization.INT)
+    layer.set_vocabulary(vocab_path)
+    int_data = layer(input_data)
+    model = keras.Model(inputs=input_data, outputs=int_data)
+
+    output_dataset = model.predict(input_array)
+    self.assertAllEqual(expected_output, output_dataset)
+
 
 @keras_parameterized.run_all_keras_modes
 class TextVectorizationDistributionTest(