Allow IndexLookup to load vocab from a file.

PiperOrigin-RevId: 295348705
Change-Id: I4f2bb8e51bbeb94a916c57a0fc3423ea80b96956
A. Unique TensorFlower, 2020-02-15 10:46:59 -08:00; committed by TensorFlower Gardener
parent 1b8c5220b3
commit 6d00b470f5
6 changed files with 93 additions and 14 deletions
Changed paths: tensorflow/python/keras/layers/preprocessing, tensorflow/tools/pip_package
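For orientation, here is a minimal sketch of the behavior this change adds, mirroring the new test_int_output_explicit_vocab_from_file test below; the internal import paths follow the test file, and the vocabulary file location is illustrative:

import numpy as np
from tensorflow.python import keras
from tensorflow.python.framework import dtypes
from tensorflow.python.keras.layers.preprocessing import index_lookup

# vocab.txt contains one token per line: earth, wind, and, fire.
vocab_path = "testdata/vocab.txt"  # illustrative path

input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = index_lookup.IndexLookup(vocabulary=vocab_path)  # a file path is now accepted
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)

input_array = np.array([["earth", "wind", "and", "fire"],
                        ["fire", "and", "earth", "michigan"]])
# Known tokens map to indices 2..5 and the OOV token "michigan" maps to 1,
# matching the expected output asserted in the test.
print(model.predict(input_array))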

tensorflow/python/keras/layers/preprocessing/BUILD

@@ -11,6 +11,14 @@ package(
exports_files(["LICENSE"])
filegroup(
name = "testdata",
srcs = [
"testdata/repeated_vocab.txt",
"testdata/vocab.txt",
],
)
py_library(
name = "preprocessing",
srcs = [
@@ -276,6 +284,7 @@ tf_py_test(
name = "index_lookup_test",
size = "medium",
srcs = ["index_lookup_test.py"],
data = [":testdata"],
python_version = "PY3",
deps = [
":index_lookup",
@@ -303,10 +312,9 @@ cuda_py_test(
)
tf_py_test(
name = "preprocessing_normalization_test",
name = "normalization_test",
size = "small",
srcs = ["normalization_test.py"],
main = "normalization_test.py",
python_version = "PY3",
deps = [
":normalization",
@@ -317,10 +325,9 @@ tf_py_test(
)
tf_py_test(
name = "preprocessing_text_vectorization_test",
name = "text_vectorization_test",
size = "medium",
srcs = ["text_vectorization_test.py"],
main = "text_vectorization_test.py",
python_version = "PY3",
deps = [
":preprocessing_test_utils",

tensorflow/python/keras/layers/preprocessing/index_lookup.py

@@ -32,6 +32,7 @@ from tensorflow.python.ops import array_ops
from tensorflow.python.ops import lookup_ops
from tensorflow.python.ops.ragged import ragged_functional_ops
from tensorflow.python.ops.ragged import ragged_tensor
from tensorflow.python.platform import gfile
from tensorflow.python.util import compat
# The string tokens in the extracted vocabulary
@@ -66,7 +67,13 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer):
1. If this value is more than 1, OOV inputs are hashed to determine their
OOV value; if this value is 0, passing an OOV input will result in a
runtime error.
vocabulary: An optional list of vocabulary terms.
vocabulary: An optional list of vocabulary terms, or a path to a text file
containing a vocabulary to load into this layer. The file should contain
one token per line. In either case, the vocabulary must be unique; if
the list or file contains the same token multiple times, an error will
be thrown. Note that when passing a vocabulary - either as a list or as
a file - the vocabulary will not be present in the layer's config dict;
it will instead be a part of the layer's weights.
reserve_zero: Whether to reserve the index 0, which indicates pad values in
the Keras masking system. If True, the output of this layer will be in the
range `[1...max_tokens+1)`; if False, the output will be in the range
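In practice, the vocabulary argument described in the docstring above can be supplied in either of two forms; a short sketch (the internal import and the file path are illustrative):

from tensorflow.python.keras.layers.preprocessing import index_lookup

# 1) As a Python list of unique tokens...
layer_from_list = index_lookup.IndexLookup(vocabulary=["earth", "wind", "and", "fire"])

# 2) ...or as a path to a text file with one token per line.
layer_from_file = index_lookup.IndexLookup(vocabulary="/tmp/vocab.txt")  # illustrative path

# In both cases a repeated token raises a ValueError that names the duplicates,
# and the vocabulary ends up in the layer's weights rather than its config.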
@@ -164,10 +171,38 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer):
self._inverse_table = None
if vocabulary is not None:
self._export_vocab = True
if isinstance(vocabulary, str):
vocabulary = self._get_vocabulary_from_file(vocabulary)
vocabulary_set = set(vocabulary)
if len(vocabulary) != len(vocabulary_set):
repeated_items = [
item for item, count in collections.Counter(vocabulary).items()
if count > 1
]
raise ValueError("The passed vocabulary has at least one repeated "
"term. Please uniquify your dataset before passing "
"it to IndexLookup(). The repeated terms are %s" %
repeated_items)
self.set_vocabulary(vocabulary)
else:
self._export_vocab = False
def _get_vocabulary_from_file(self, vocabulary_path):
vocab = []
with gfile.GFile(vocabulary_path, "r") as reader:
while True:
# Get the next line, and break if it is None.
text = reader.readline()
if not text:
break
# Convert the raw text into UTF8 and strip whitespace.
if isinstance(text, str):
token = text
elif isinstance(text, bytes):
token = text.decode("utf-8", "ignore")
token = token.strip()
vocab.append(token)
return vocab
def _get_table_data(self):
keys, values = self._table.export()
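A small illustration of the file format _get_vocabulary_from_file expects: one token per line, decoded as UTF-8 where needed and stripped of surrounding whitespace (the temporary path below is illustrative):

import os
import tempfile
from tensorflow.python.keras.layers.preprocessing import index_lookup

# Write a vocabulary file: one token per line.
vocab_path = os.path.join(tempfile.mkdtemp(), "vocab.txt")
with open(vocab_path, "w") as f:
  f.write("earth\nwind\nand\nfire\n")

# The file is read into the list ["earth", "wind", "and", "fire"] and then goes
# through the same uniqueness check and set_vocabulary() call as a plain list.
layer = index_lookup.IndexLookup(vocabulary=vocab_path)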
@@ -256,11 +291,10 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer):
return [x for _, x in sorted(zip(values, keys))]
def get_config(self):
vocabulary = self.get_vocabulary() if self._export_vocab else None
config = {
"max_tokens": self.max_tokens,
"num_oov_tokens": self.num_oov_tokens,
"vocabulary": vocabulary,
"vocabulary": None,
"reserve_zero": self.reserve_zero,
"mask_zero": self.mask_zero,
}
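With the get_config change above, a vocabulary passed at construction is no longer serialized into the config dict; a brief sketch of the resulting behavior (internal import is illustrative):

from tensorflow.python.keras.layers.preprocessing import index_lookup

layer = index_lookup.IndexLookup(vocabulary=["earth", "wind", "and", "fire"])

# The tokens remain recoverable from the layer itself, in index order...
print(layer.get_vocabulary())

# ...but the config no longer carries them: the vocabulary is part of the
# layer's weights, so it travels with the weights rather than with the config.
assert layer.get_config()["vocabulary"] is None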

tensorflow/python/keras/layers/preprocessing/index_lookup_test.py

@@ -37,6 +37,7 @@ from tensorflow.python.keras.layers.preprocessing import index_lookup_v1
from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils
from tensorflow.python.keras.utils.generic_utils import CustomObjectScope
from tensorflow.python.ops.ragged import ragged_factory_ops
from tensorflow.python.platform import resource_loader
from tensorflow.python.platform import test
@@ -355,7 +356,13 @@ class IndexLookupOutputTest(keras_parameterized.TestCase,
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
def test_int_output_explicit_vocab_from_config(self):
@keras_parameterized.run_all_keras_modes
class IndexLookupVocabularyTest(keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest
):
def test_int_output_explicit_vocab(self):
vocab_data = ["earth", "wind", "and", "fire"]
input_array = np.array([["earth", "wind", "and", "fire"],
["fire", "and", "earth", "michigan"]])
@@ -365,10 +372,20 @@ class IndexLookupOutputTest(keras_parameterized.TestCase,
layer = get_layer_class()(vocabulary=vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
with CustomObjectScope({"IndexLookup": get_layer_class()}):
new_model = keras.Model.from_config(model.get_config())
output_dataset = new_model.predict(input_array)
def test_int_output_explicit_vocab_from_file(self):
vocab_data = resource_loader.get_path_to_datafile("testdata/vocab.txt")
input_array = np.array([["earth", "wind", "and", "fire"],
["fire", "and", "earth", "michigan"]])
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(vocabulary=vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
def test_vocab_appending(self):
@@ -386,6 +403,17 @@ class IndexLookupOutputTest(keras_parameterized.TestCase,
output_dataset = model.predict(input_array)
self.assertAllClose(expected_output, output_dataset)
def test_non_unique_vocab_fails(self):
vocab_data = ["earth", "wind", "and", "fire", "fire"]
with self.assertRaisesRegex(ValueError, ".*repeated term.*fire.*"):
_ = get_layer_class()(vocabulary=vocab_data)
def test_non_unique_vocab_from_file_fails(self):
vocab_data = resource_loader.get_path_to_datafile(
"testdata/repeated_vocab.txt")
with self.assertRaisesRegex(ValueError, ".*repeated term.*earth.*"):
_ = get_layer_class()(vocabulary=vocab_data)
@keras_parameterized.run_all_keras_modes
class InverseLookupOutputTest(keras_parameterized.TestCase,

tensorflow/python/keras/layers/preprocessing/testdata/repeated_vocab.txt

@@ -0,0 +1,5 @@
earth
wind
and
fire
earth

tensorflow/python/keras/layers/preprocessing/testdata/vocab.txt

@@ -0,0 +1,4 @@
earth
wind
and
fire

tensorflow/tools/pip_package/pip_smoke_test.py

@@ -83,6 +83,7 @@ DEPENDENCY_BLACKLIST = [
"//tensorflow/core:lmdb_testdata",
"//tensorflow/core/kernels/cloud:bigquery_reader_ops",
"//tensorflow/python/debug:grpc_tensorflow_server.par",
"//tensorflow/python/keras/layers/preprocessing:testdata",
"//tensorflow/python/feature_column:vocabulary_testdata",
"//tensorflow/python:framework/test_file_system.so",
"//tensorflow/python:util_nest_test_main_lib",