Split index_lookup into string_lookup and integer_lookup.

PiperOrigin-RevId: 311651579
Change-Id: Ie033727dbe1026a7c7a88e4b31653840a17ac3d1
A. Unique TensorFlower 2020-05-14 19:07:07 -07:00 committed by TensorFlower Gardener
parent 0de7edf8b1
commit efa3fb28d9
16 changed files with 1658 additions and 439 deletions

View File: tensorflow/python/keras/layers/preprocessing/BUILD

@ -27,10 +27,12 @@ py_library(
":discretization",
":hashing",
":image_preprocessing",
":integer_lookup",
":normalization",
":preprocessing_stage",
":preprocessing_test_utils",
":reduction",
":string_lookup",
":text_vectorization",
],
)
@ -146,6 +148,20 @@ py_library(
],
)
py_library(
name = "integer_lookup",
srcs = [
"integer_lookup.py",
"integer_lookup_v1.py",
],
srcs_version = "PY2AND3",
deps = [
":index_lookup",
":table_utils",
"//tensorflow/python:dtypes",
],
)
py_library(
name = "table_utils",
srcs = [
@ -179,7 +195,7 @@ py_library(
srcs_version = "PY2AND3",
deps = [
":categorical_encoding",
":index_lookup",
":string_lookup",
"//tensorflow/python:array_ops",
"//tensorflow/python:control_flow_ops",
"//tensorflow/python:dtypes",
@ -235,6 +251,20 @@ py_library(
],
)
py_library(
name = "string_lookup",
srcs = [
"string_lookup.py",
"string_lookup_v1.py",
],
srcs_version = "PY2AND3",
deps = [
":index_lookup",
":table_utils",
"//tensorflow/python:dtypes",
],
)
py_library(
name = "preprocessing_stage",
srcs = [
@ -442,6 +472,22 @@ tf_py_test(
],
)
tf_py_test(
name = "integer_lookup_test",
size = "medium",
srcs = ["integer_lookup_test.py"],
python_version = "PY3",
deps = [
":integer_lookup",
":preprocessing_test_utils",
"//tensorflow/python:client_testlib",
"//tensorflow/python/keras",
"//tensorflow/python/keras/utils:generic_utils",
"//tensorflow/python/ops/ragged:ragged_string_ops",
"@absl_py//absl/testing:parameterized",
],
)
distribute_py_test(
name = "normalization_distribution_test",
srcs = ["normalization_distribution_test.py"],
@ -517,6 +563,22 @@ tf_py_test(
],
)
tf_py_test(
name = "string_lookup_test",
size = "medium",
srcs = ["string_lookup_test.py"],
python_version = "PY3",
deps = [
":preprocessing_test_utils",
":string_lookup",
"//tensorflow/python:client_testlib",
"//tensorflow/python/keras",
"//tensorflow/python/keras/utils:generic_utils",
"//tensorflow/python/ops/ragged:ragged_string_ops",
"@absl_py//absl/testing:parameterized",
],
)
tf_py_test(
name = "preprocessing_stage_test",
srcs = ["preprocessing_stage_test.py"],

View File: tensorflow/python/keras/layers/preprocessing/index_lookup.py

@ -41,14 +41,16 @@ _ACCUMULATOR_COUNTS_NAME = "counts"
class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer):
"""Maps strings (or integers) from a vocabulary to integer indices.
"""Maps values from a vocabulary to integer indices.
This layer translates a set of arbitrary strings or integers into an integer
output via a table-based lookup, with optional out-of-vocabulary handling.
This layer translates a set of arbitrary hashables into an integer output via
a table-based lookup, with optional out-of-vocabulary handling. This is the
basis layer for both IntegerLookup and StringLookup; it holds the common
logic but is not intended to be exported as part of the Keras API.
If desired, the user can call this layer's `adapt()` method on a data set,
which will analyze the data set, determine the frequency of individual string
or integer values, and create a vocabulary from them. This vocabulary can have
values, and create a vocabulary from them. This vocabulary can have
unlimited size or be capped, depending on the configuration options for this
layer; if there are more unique values in the input than the maximum
vocabulary size, the most frequent terms will be used to create the
@ -56,84 +58,47 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer):
Attributes:
max_tokens: The maximum size of the vocabulary for this layer. If None,
there is no cap on the size of the vocabulary. Note that the vocabulary
does include OOV buckets, so the effective number of unique values in the
vocabulary is `(max_tokens - num_oov_tokens)` when this value is set.
num_oov_tokens: The number of out-of-vocabulary tokens to use; defaults to
1. If this value is more than 1, OOV inputs are hashed to determine their
OOV value; if this value is 0, passing an OOV input will result in a '-1'
being returned for that value in the output tensor. (Note that, because
the value is -1 and not 0, this will allow you to effectively drop OOV
values from categorical encodings.)
vocabulary: An optional list of vocabulary terms, or a path to a text file
containing a vocabulary to load into this layer. The file should contain
one token per line. In either case, the vocabulary must be unique; if
the list or file contains the same token multiple times, an error will
be thrown. Note that when passing a vocabulary - either as a list or as
a file - the vocabulary will not be present in the layer's config dict;
it will instead be a part of the layer's weights.
reserve_zero: Whether to reserve the index 0, which indicates pad values in
the Keras masking system. If True, the output of this layer will be in the
range `[1...max_tokens+1)`; if False, the output will be in the range
`[0...max_tokens)`. Defaults to True.
mask_zero: If True, input values of 0 (for integers) and `""` (for strings)
will be treated as masked values and assigned an output value of 0. If
this option is set, `reserve_zero` must also be set. Defaults to False.
Call arguments:
inputs: The data to look up. Can be a tf.Tensor or RaggedTensor.
invert: Controls the lookup direction. If False, the layer will map strings
to integers; if true, the layer will map integers to strings. Defaults
to False.
there is no cap on the size of the vocabulary. Note that this vocabulary
includes the OOV and mask tokens, so the effective number of tokens is
(max_tokens - num_oov_indices - (1 if mask_token else 0))
num_oov_indices: The number of out-of-vocabulary tokens to use. If this
value is more than 1, OOV inputs are hashed to determine their OOV value;
if this value is 0, passing an OOV input will result in a '-1' being
returned for that value in the output tensor. (Note that, because the
value is -1 and not 0, this will allow you to effectively drop OOV values
from categorical encodings.)
mask_token: A token that represents masked values, and which is mapped to
index 0. If set to None, no mask term will be added and the OOV tokens, if
any, will be indexed from (0...num_oov_indices) instead of
(1...num_oov_indices+1).
oov_token: The token representing an out-of-vocabulary value. This token is
only used when performing an inverse lookup.
vocabulary: An optional list of vocabulary terms. If the list contains the
same token multiple times, an error will be thrown.
"""
# TODO(momernick): Add an examples section to the docstring.
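# Illustrative usage sketch (not part of this change; the argument values
# mirror the string test cases below, and `tf.constant` is assumed to be
# available in the calling code):
#
#   layer = IndexLookup(
#       max_tokens=None, num_oov_indices=1, mask_token="", oov_token="[OOV]",
#       vocabulary=["earth", "wind", "and", "fire"], dtype=dtypes.string)
#   layer(tf.constant([["earth", "wind"], ["fire", "michigan"]]))
#   # => [[2, 3], [5, 1]]  (index 0 is the mask token, index 1 is the OOV
#   #     token, and the vocabulary occupies indices 2 and up)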
def __init__(self,
max_tokens=None,
num_oov_tokens=1,
max_tokens,
num_oov_indices,
mask_token,
oov_token,
vocabulary=None,
reserve_zero=True,
mask_zero=False,
**kwargs):
invert = False
if invert:
allowed_dtypes = [dtypes.int32, dtypes.int64]
else:
allowed_dtypes = [dtypes.string, dtypes.int32, dtypes.int64]
if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes:
raise ValueError("TextVectorization may only have a dtype in %s." %
allowed_dtypes)
if "dtype" not in kwargs:
kwargs["dtype"] = dtypes.int64 if invert else dtypes.string
# If max_tokens is set, the value must be greater than 1 - otherwise we
# are creating a 0-element vocab, which doesn't make sense.
if max_tokens is not None and max_tokens <= 1:
raise ValueError("If set, max_tokens must be greater than 1.")
if num_oov_tokens < 0:
raise ValueError("num_oov_tokens must be greater than 0. You passed %s" %
num_oov_tokens)
if num_oov_indices < 0:
raise ValueError("num_oov_indices must be greater than 0. You passed %s" %
num_oov_indices)
self.invert = invert
self.max_tokens = max_tokens
self.num_oov_tokens = num_oov_tokens
self.reserve_zero = reserve_zero
self.mask_zero = mask_zero
# We need to reserve at least num_oov_tokens tokens, plus one additional
# value if we are reserving the zero value in our output.
if reserve_zero:
self._reserved_values = (num_oov_tokens + 1)
else:
self._reserved_values = num_oov_tokens
# We need to account for the OOV buckets in our vocabulary size.
if max_tokens is not None:
self._max_elements = max_tokens - num_oov_tokens
else:
self._max_elements = None
self.num_oov_indices = num_oov_indices
self.oov_token = oov_token
self.mask_token = mask_token
# If there is only one OOV bucket, we can determine the OOV value (either 0
# or 1 depending on whether 0 is reserved) and set that as the default
@ -141,20 +106,17 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer):
# do a further hashing step; to make this easier, we set the OOV value to
# -1. (This lets us do a vectorized add and cast to boolean to determine
# locations where we need to do extra hashing.)
if self.num_oov_tokens == 1:
self._oov_value = 1 if reserve_zero else 0
if self.num_oov_indices == 1:
self._oov_value = 0 if mask_token is None else 1
else:
self._oov_value = -1
super(IndexLookup, self).__init__(
combiner=_IndexLookupCombiner(self.max_tokens), **kwargs)
combiner=_IndexLookupCombiner(self.max_tokens, self.mask_token),
**kwargs)
self._output_dtype = dtypes.int64
# If the layer's input type is int32, we can only output int32 values -
# MutableHashTable doesn't allow us to map int32->int64.
if self.dtype == dtypes.int32:
self._output_dtype = dtypes.int32
else:
self._output_dtype = dtypes.int64
self._table = lookup_ops.MutableHashTable(
key_dtype=self.dtype,
value_dtype=self._output_dtype,
@ -167,33 +129,27 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer):
# counting code in the Model object doesn't throw an attribute error.
tracked_table.shape = tensor_shape.TensorShape((0,))
if self.num_oov_tokens <= 1:
oov_tokens = None
if self.num_oov_indices <= 1:
oov_indices = None
else:
oov_start = 1 if reserve_zero else 0
oov_tokens = list(range(oov_start, self._reserved_values))
oov_start = 1 if mask_token is not None else 0
oov_end = oov_start + num_oov_indices
oov_indices = list(range(oov_start, oov_end))
self._table_handler = table_utils.TableHandler(
table=self._table,
oov_tokens=oov_tokens,
oov_tokens=oov_indices,
use_v1_apis=self._use_v1_apis())
if vocabulary is not None:
if isinstance(vocabulary, str):
vocabulary = table_utils.get_vocabulary_from_file(vocabulary)
table_utils.validate_vocabulary_is_unique(vocabulary)
self.set_vocabulary(vocabulary)
def compute_output_shape(self, input_shape):
return input_shape
def compute_output_signature(self, input_spec, invert=False):
def compute_output_signature(self, input_spec):
output_shape = self.compute_output_shape(input_spec.shape.as_list())
if invert:
output_dtype = dtypes.string
else:
output_dtype = dtypes.int64
output_dtype = dtypes.int64
return tensor_spec.TensorSpec(shape=output_shape, dtype=output_dtype)
def adapt(self, data, reset_state=True):
@ -220,10 +176,7 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer):
keys, values = self._table_handler.data()
# This is required because the MutableHashTable doesn't preserve insertion
# order, but we rely on the order of the array to assign indices.
if self.dtype == dtypes.string:
return [x.decode("utf-8") for _, x in sorted(zip(values, keys))]
else:
return [x for _, x in sorted(zip(values, keys))]
return [x for _, x in sorted(zip(values, keys))]
def vocab_size(self):
return self._table_handler.vocab_size()
@ -231,10 +184,9 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer):
def get_config(self):
config = {
"max_tokens": self.max_tokens,
"num_oov_tokens": self.num_oov_tokens,
"vocabulary": None,
"reserve_zero": self.reserve_zero,
"mask_zero": self.mask_zero,
"num_oov_indices": self.num_oov_indices,
"oov_token": self.oov_token,
"mask_token": self.mask_token,
}
base_config = super(IndexLookup, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
@ -246,46 +198,101 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer):
# abstraction for ease of saving!) we return 0.
return 0
def set_vocabulary(self,
vocab,
append=False):
def set_vocabulary(self, vocab):
"""Sets vocabulary (and optionally document frequency) data for this layer.
This method sets the vocabulary for this layer directly, instead of
analyzing a dataset through 'adapt'. It should be used whenever the vocab
information is already known. If vocabulary data is already present in the
layer, this method will either replace it, if 'append' is set to False, or
append to it (if 'append' is set to True).
layer, this method will replace it.
Arguments:
vocab: An array of string tokens.
append: Whether to overwrite or append any existing vocabulary data.
Raises:
ValueError: If there are too many inputs, the inputs do not match, or
input data is missing.
"""
current_table_size = self._table_handler.vocab_size()
total_vocab_size = len(vocab) + (current_table_size if append else 0)
if self.max_tokens is not None and total_vocab_size > self._max_elements:
table_utils.validate_vocabulary_is_unique(vocab)
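# Note (illustrative, assuming mask_token="" and oov_token="[OOV]"): both
# ["", "[OOV]", "earth", "wind"] and ["earth", "wind"] are accepted here.
# In the second case the special tokens are not present in the passed vocab
# and are inserted automatically at the start of the index range below.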
should_have_mask = self.mask_token is not None
if should_have_mask:
has_mask = vocab[0] == self.mask_token
oov_start = 1
else:
has_mask = False
oov_start = 0
should_have_oov = self.num_oov_indices > 0
if should_have_oov:
oov_end = oov_start + self.num_oov_indices
expected_oov = [self.oov_token] * self.num_oov_indices
has_oov = vocab[oov_start:oov_end] == expected_oov
# If we get a numpy array, then has_oov may end up being a numpy array
# instead of a bool. Fix this by collapsing the variable if it's not bool.
if not isinstance(has_oov, bool):
has_oov = any(has_oov)
else:
has_oov = False
if all([should_have_mask, has_mask, should_have_oov]) and not has_oov:
raise ValueError("The passed vocabulary has the correct mask token `%s` "
"at index 0, but does not have the OOV token `%s` in "
"indices [%s:%s]. Instead, we found `%s`. Was this "
"vocabulary generated by a layer with incompatible "
"settings?" %
(self.mask_token, self.oov_token, oov_start, oov_end,
vocab[oov_start:oov_end]))
if all([should_have_oov, has_oov, should_have_mask]) and not has_mask:
raise ValueError(
"The passed vocabulary has the correct OOV token `%s` at "
"indices [%s:%s], but does not have the mask token `%s` in "
"index 0. Instead, we found `%s`. Was this vocabulary "
"generated by a layer with incompatible settings?" %
(self.oov_token, oov_start, oov_end, self.mask_token, vocab[0]))
insert_special_tokens = not has_oov and not has_mask
special_tokens = [] if self.mask_token is None else [self.mask_token]
special_tokens.extend([self.oov_token] * self.num_oov_indices)
num_special_tokens = len(special_tokens)
tokens = vocab if insert_special_tokens else vocab[num_special_tokens:]
if self.mask_token in tokens:
raise ValueError("Reserved mask token %s was found in the passed "
"vocabulary at index %s. Please either remove the "
"reserved token from the vocabulary or change the "
"mask token for this layer." %
(self.mask_token, tokens.index(self.mask_token)))
if self.oov_token in tokens:
raise ValueError("Reserved OOV token %s was found in the passed "
"vocabulary at index %s. Please either remove the "
"reserved token from the vocabulary or change the "
"OOV token for this layer." %
(self.oov_token, tokens.index(self.oov_token)))
if insert_special_tokens:
total_vocab_size = len(vocab) + num_special_tokens
else:
total_vocab_size = len(vocab)
if self.max_tokens is not None and total_vocab_size > self.max_tokens:
raise ValueError(
"Attempted to set a vocabulary larger than the maximum vocab size. "
"Passed vocab size is %s, max vocab size is %s. Note that the OOV "
"token(s) are automatically added to the number of tokens." %
"Passed vocab size is %s, max vocab size is %s." %
(total_vocab_size, self.max_tokens))
start_index = self._reserved_values + (current_table_size if append else 0)
start_index = num_special_tokens
values = np.arange(start_index, len(vocab) + start_index, dtype=np.int64)
vocab = table_utils.convert_to_ndarray(vocab, self.dtype)
table_utils.assert_same_type(self.dtype, vocab, "vocab")
values = table_utils.convert_to_ndarray(values, self._output_dtype)
table_utils.assert_same_type(self._output_dtype, values, "values")
if not append and current_table_size > 0:
self._table_handler.clear()
self._table_handler.clear()
self._table_handler.insert(vocab, values)
if insert_special_tokens and num_special_tokens > 0:
special_token_values = np.arange(num_special_tokens, dtype=np.int64)
self._table_handler.insert(special_tokens, special_token_values)
def _set_state_variables(self, updates):
if not self.built:
raise RuntimeError("_set_state_variables() must be called after build().")
@ -316,18 +323,20 @@ class _IndexLookupCombiner(base_preprocessing_layer.Combiner):
dataset, all tokens are retained.
"""
def __init__(self, vocab_size=None):
def __init__(self, vocab_size=None, mask_value=None):
self._vocab_size = vocab_size
self._mask_value = mask_value
def compute(self, values, accumulator=None):
"""Compute a step in this computation, returning a new accumulator."""
values = base_preprocessing_layer.convert_to_list(values)
values = base_preprocessing_layer.convert_to_list(
values, sparse_default_value=self._mask_value)
if accumulator is None:
accumulator = self._create_accumulator()
# TODO(momernick): Benchmark improvements to this algorithm.
if isinstance(values, (str, bytes)):
if isinstance(values, (str, bytes, np.int64)):
accumulator.count_dict[values] += 1
else:
for document in values:
@ -362,6 +371,8 @@ class _IndexLookupCombiner(base_preprocessing_layer.Combiner):
"vocab": A list of the retained items in the vocabulary.
"""
vocab_counts = accumulator.count_dict
if self._mask_value in vocab_counts:
del vocab_counts[self._mask_value]
sorted_counts = sorted(
vocab_counts.items(), key=operator.itemgetter(1, 0), reverse=True)
vocab_data = (

View File: tensorflow/python/keras/layers/preprocessing/index_lookup_distribution_test.py

@ -65,7 +65,12 @@ class IndexLookupDistributionTest(
with distribution.scope():
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()()
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
layer.adapt(vocab_dataset)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)

View File: tensorflow/python/keras/layers/preprocessing/index_lookup_test.py

@ -21,7 +21,6 @@ from __future__ import print_function
import itertools
import os
import random
import six
import string
from absl.testing import parameterized
@ -31,7 +30,6 @@ from tensorflow.python import keras
from tensorflow.python import tf2
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.distribute import one_device_strategy
from tensorflow.python.eager import context
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import sparse_tensor
@ -44,7 +42,6 @@ from tensorflow.python.keras.layers.preprocessing import preprocessing_test_util
from tensorflow.python.keras.saving import save
from tensorflow.python.keras.utils.generic_utils import CustomObjectScope
from tensorflow.python.ops.ragged import ragged_factory_ops
from tensorflow.python.platform import gfile
from tensorflow.python.platform import test
@ -71,6 +68,10 @@ def _get_end_to_end_test_cases():
["and"], ["earth"], ["michigan"]]),
"kwargs": {
"max_tokens": None,
"num_oov_indices": 1,
"mask_token": "",
"oov_token": "[OOV]",
"dtype": dtypes.string,
},
"expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]],
"input_dtype":
@ -91,6 +92,9 @@ def _get_end_to_end_test_cases():
dtype=np.int64),
"kwargs": {
"max_tokens": None,
"num_oov_indices": 1,
"mask_token": 0,
"oov_token": -1,
"dtype": dtypes.int64,
},
"expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]],
@ -172,7 +176,12 @@ class CategoricalEncodingInputTest(
expected_dense_shape = [3, 4]
input_data = keras.Input(shape=(None,), dtype=dtypes.string, sparse=True)
layer = get_layer_class()(max_tokens=None)
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -193,7 +202,12 @@ class CategoricalEncodingInputTest(
expected_dense_shape = [3, 4]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
layer = get_layer_class()(max_tokens=None, dtype=dtypes.int64)
layer = get_layer_class()(
max_tokens=None,
dtype=dtypes.int64,
num_oov_indices=1,
mask_token=0,
oov_token=-1)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -209,7 +223,12 @@ class CategoricalEncodingInputTest(
expected_output = [[2, 3, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.string, ragged=True)
layer = get_layer_class()(max_tokens=None)
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -223,7 +242,12 @@ class CategoricalEncodingInputTest(
expected_output = [[2, 3, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True)
layer = get_layer_class()(max_tokens=None, dtype=dtypes.int64)
layer = get_layer_class()(
max_tokens=None,
dtype=dtypes.int64,
num_oov_indices=1,
mask_token=0,
oov_token=-1)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -248,7 +272,12 @@ class CategoricalEncodingMultiOOVTest(
expected_dense_shape = [3, 4]
input_data = keras.Input(shape=(None,), dtype=dtypes.string, sparse=True)
layer = get_layer_class()(max_tokens=None, num_oov_tokens=2)
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=2,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -270,7 +299,11 @@ class CategoricalEncodingMultiOOVTest(
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
layer = get_layer_class()(
max_tokens=None, dtype=dtypes.int64, num_oov_tokens=2)
max_tokens=None,
dtype=dtypes.int64,
num_oov_indices=2,
mask_token=0,
oov_token=-1)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -286,7 +319,12 @@ class CategoricalEncodingMultiOOVTest(
expected_output = [[3, 4, 6], [6, 5, 3, 2]]
input_data = keras.Input(shape=(None,), dtype=dtypes.string, ragged=True)
layer = get_layer_class()(max_tokens=None, num_oov_tokens=2)
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=2,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -301,7 +339,11 @@ class CategoricalEncodingMultiOOVTest(
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True)
layer = get_layer_class()(
max_tokens=None, dtype=dtypes.int64, num_oov_tokens=2)
max_tokens=None,
dtype=dtypes.int64,
num_oov_indices=2,
mask_token=0,
oov_token=-1)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -321,13 +363,14 @@ class CategoricalEncodingAdaptTest(
dense_shape=[3, 4])
vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data)
layer = get_layer_class()(max_tokens=None)
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
layer.adapt(vocab_dataset)
# Note that the expected vocabulary has a null string (''). This is because
# we assume that sparse tensors are in fact dense tensors with elided
# values, not ragged tensors. Therefore, we assume that any missing data
# is important and give it a spot in our vocab.
expected_vocabulary = ["", "michigan", "fire"]
expected_vocabulary = ["", "[OOV]", "michigan", "fire"]
self.assertAllEqual(expected_vocabulary, layer.get_vocabulary())
def test_ragged_adapt(self):
@ -335,9 +378,14 @@ class CategoricalEncodingAdaptTest(
["fire", "michigan"]])
vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data)
layer = get_layer_class()(max_tokens=None)
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
layer.adapt(vocab_dataset)
expected_vocabulary = ["michigan", "fire"]
expected_vocabulary = ["", "[OOV]", "michigan", "fire"]
self.assertAllEqual(expected_vocabulary, layer.get_vocabulary())
def test_sparse_int_input(self):
@ -352,7 +400,12 @@ class CategoricalEncodingAdaptTest(
expected_dense_shape = [3, 4]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
layer = get_layer_class()(max_tokens=None, dtype=dtypes.int64)
layer = get_layer_class()(
max_tokens=None,
dtype=dtypes.int64,
num_oov_indices=1,
mask_token=0,
oov_token=-1)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -368,7 +421,12 @@ class CategoricalEncodingAdaptTest(
expected_output = [[2, 3, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.string, ragged=True)
layer = get_layer_class()(max_tokens=None)
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -382,7 +440,12 @@ class CategoricalEncodingAdaptTest(
expected_output = [[2, 3, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True)
layer = get_layer_class()(max_tokens=None, dtype=dtypes.int64)
layer = get_layer_class()(
max_tokens=None,
dtype=dtypes.int64,
num_oov_indices=1,
mask_token=0,
oov_token=-1)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -400,34 +463,15 @@ class CategoricalEncodingAdaptTest(
batched_ds = ds.take(2)
input_t = keras.Input(shape=(), dtype=dtypes.string)
layer = get_layer_class()(
max_tokens=10, num_oov_tokens=0, reserve_zero=False)
max_tokens=10,
num_oov_indices=0,
mask_token=None,
oov_token=None,
dtype=dtypes.string)
_ = layer(input_t)
layer.adapt(batched_ds)
@keras_parameterized.run_all_keras_modes
class IndexLookupDistributionTest(
keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest):
def test_cpu_distribution(self):
vocab_data = ["earth", "wind", "and", "fire"]
input_array = np.array([["earth", "wind", "and", "fire"],
["fire", "and", "earth", "michigan"]])
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
strategy = one_device_strategy.OneDeviceStrategy("/cpu:0")
with strategy.scope():
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()()
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
@keras_parameterized.run_all_keras_modes
class IndexLookupOutputTest(keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest):
@ -439,7 +483,12 @@ class IndexLookupOutputTest(keras_parameterized.TestCase,
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()()
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -448,7 +497,12 @@ class IndexLookupOutputTest(keras_parameterized.TestCase,
def test_output_shape(self):
input_data = keras.Input(shape=(4,), dtype=dtypes.string)
layer = get_layer_class()()
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
int_data = layer(input_data)
self.assertAllEqual(int_data.shape[1:], input_data.shape[1:])
@ -459,7 +513,12 @@ class IndexLookupOutputTest(keras_parameterized.TestCase,
expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]]
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(reserve_zero=False)
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token=None,
oov_token="[OOV]",
dtype=dtypes.string)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -473,7 +532,13 @@ class IndexLookupOutputTest(keras_parameterized.TestCase,
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(vocabulary=vocab_data)
layer = get_layer_class()(
vocabulary=vocab_data,
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
@ -485,15 +550,6 @@ class IndexLookupVocabularyTest(keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest
):
def _write_to_temp_file(self, file_name, vocab_list):
vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt")
with gfile.GFile(vocab_path, "w") as writer:
for vocab in vocab_list:
writer.write(vocab + "\n")
writer.flush()
writer.close()
return vocab_path
def test_int_output_explicit_vocab(self):
vocab_data = ["earth", "wind", "and", "fire"]
input_array = np.array([["earth", "wind", "and", "fire"],
@ -501,107 +557,195 @@ class IndexLookupVocabularyTest(keras_parameterized.TestCase,
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(vocabulary=vocab_data)
layer = get_layer_class()(
vocabulary=vocab_data,
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
def test_get_vocab_returns_str(self):
vocab_data = ["earth", "wind", "and", "fire"]
layer = get_layer_class()(vocabulary=vocab_data)
layer_vocab = layer.get_vocabulary()
self.assertAllEqual(vocab_data, layer_vocab)
self.assertIsInstance(layer_vocab[0], six.text_type)
def test_vocab_with_max_cap(self):
vocab_data = ["", "[OOV]", "wind", "and", "fire"]
layer = get_layer_class()(
max_tokens=5,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
layer.set_vocabulary(vocab_data)
returned_vocab = layer.get_vocabulary()
self.assertAllEqual(vocab_data, returned_vocab)
def test_int_output_explicit_vocab_from_file(self):
vocab_list = ["earth", "wind", "and", "fire"]
vocab_path = self._write_to_temp_file("vocab_file", vocab_list)
input_array = np.array([["earth", "wind", "and", "fire"],
["fire", "and", "earth", "michigan"]])
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(vocabulary=vocab_path)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
def test_vocab_appending(self):
vocab_data = [["earth", "wind"], ["and", "fire"]]
input_array = np.array([["earth", "wind", "and", "fire"],
["fire", "and", "earth", "michigan"]])
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(max_tokens=5)
layer.set_vocabulary(vocab_data[0])
layer.set_vocabulary(vocab_data[1], append=True)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllClose(expected_output, output_dataset)
def test_int_vocab_with_max_cap(self):
vocab_data = [0, -1, 42, 1276, 1138]
layer = get_layer_class()(
max_tokens=5,
num_oov_indices=1,
mask_token=0,
oov_token=-1,
dtype=dtypes.int64)
layer.set_vocabulary(vocab_data)
returned_vocab = layer.get_vocabulary()
self.assertAllEqual(vocab_data, returned_vocab)
def test_non_unique_vocab_fails(self):
vocab_data = ["earth", "wind", "and", "fire", "fire"]
with self.assertRaisesRegex(ValueError, ".*repeated term.*fire.*"):
_ = get_layer_class()(vocabulary=vocab_data)
_ = get_layer_class()(
vocabulary=vocab_data,
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
def test_non_unique_vocab_from_file_fails(self):
vocab_list = ["earth", "wind", "and", "fire", "earth"]
vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list)
def test_vocab_with_oov_and_wrong_mask_fails(self):
vocab_data = ["custom_mask", "[OOV]", "earth", "wind", "and", "fire"]
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
with self.assertRaisesRegex(ValueError, ".*does not have the mask token.*"):
layer.set_vocabulary(vocab_data)
def test_vocab_with_oov_and_no_mask_fails(self):
vocab_data = ["[OOV]", "earth", "wind", "and", "fire"]
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
with self.assertRaisesRegex(ValueError, ".*Reserved OOV.*"):
layer.set_vocabulary(vocab_data)
def test_vocab_with_mask_but_no_oov_fails(self):
vocab_data = ["", "earth", "wind", "and", "fire"]
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
with self.assertRaisesRegex(ValueError, ".*does not have the OOV token.*"):
layer.set_vocabulary(vocab_data)
def test_vocab_with_repeated_element_fails(self):
vocab_data = ["earth", "earth", "wind", "and", "fire"]
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
with self.assertRaisesRegex(ValueError, ".*repeated term.*earth.*"):
_ = get_layer_class()(vocabulary=vocab_path)
layer.set_vocabulary(vocab_data)
def test_vocab_with_reserved_oov_element_fails(self):
vocab_data = ["earth", "test", "[OOV]", "wind", "and", "fire"]
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
with self.assertRaisesRegex(ValueError, ".*Reserved OOV.*"):
layer.set_vocabulary(vocab_data)
@keras_parameterized.run_all_keras_modes
class InverseLookupOutputTest(keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest):
def test_vocab_with_reserved_mask_element_fails(self):
vocab_data = ["earth", "mask_token", "wind", "and", "fire"]
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token="mask_token",
oov_token="[OOV]",
dtype=dtypes.string)
with self.assertRaisesRegex(ValueError, ".*Reserved mask.*"):
layer.set_vocabulary(vocab_data)
def DISABLE_test_inverse_output(self):
vocab_data = ["earth", "wind", "and", "fire"]
input_array = np.array([["earth", "wind", "and", "fire"],
["fire", "and", "earth", "michigan"]])
expected_ints = [[2, 3, 4, 5], [5, 4, 2, 1]]
# Note that the token 'michigan' has been replaced by ''. This is because
# 'michigan' is OOV for this layer.
expected_strings = np.array([["earth", "wind", "and", "fire"],
["fire", "and", "earth", ""]])
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(max_tokens=None)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
string_data = layer(int_data, invert=True)
model = keras.Model(inputs=input_data, outputs=[int_data, string_data])
int_outputs, string_outputs = model.predict(input_array)
self.assertAllEqual(expected_ints, int_outputs)
self.assertAllEqual(expected_strings, string_outputs)
def test_non_unique_int_vocab_fails(self):
vocab_data = [12, 13, 14, 15, 15]
with self.assertRaisesRegex(ValueError, ".*repeated term.*15.*"):
_ = get_layer_class()(
vocabulary=vocab_data,
max_tokens=None,
num_oov_indices=1,
mask_token=0,
oov_token=-1,
dtype=dtypes.int64)
def DISABLE_test_inverse_output_serialization(self):
vocab_data = ["earth", "wind", "and", "fire"]
input_array = np.array([["earth", "wind", "and", "fire"],
["fire", "and", "earth", "michigan"]])
expected_ints = [[2, 3, 4, 5], [5, 4, 2, 1]]
# Note that the token 'michigan' has been replaced by ''. This is because
# 'michigan' is OOV for this layer.
expected_strings = np.array([["earth", "wind", "and", "fire"],
["fire", "and", "earth", ""]])
def test_int_vocab_with_oov_and_wrong_mask_fails(self):
vocab_data = [1234, -1, 11, 21, 13, 14]
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token=0,
oov_token=-1,
dtype=dtypes.int64)
with self.assertRaisesRegex(ValueError, ".*does not have the mask token.*"):
layer.set_vocabulary(vocab_data)
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(max_tokens=None)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
string_data = layer(int_data, invert=True)
model = keras.Model(inputs=input_data, outputs=[int_data, string_data])
def test_int_vocab_with_oov_and_no_mask_fails(self):
vocab_data = [-1, 11, 12, 13, 14]
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token=0,
oov_token=-1,
dtype=dtypes.int64)
with self.assertRaisesRegex(ValueError, ".*Reserved OOV.*"):
layer.set_vocabulary(vocab_data)
with CustomObjectScope({"IndexLookup": get_layer_class()}):
new_model = keras.Model.from_config(model.get_config())
new_model.set_weights(model.get_weights())
int_outputs, string_outputs = new_model.predict(input_array)
self.assertAllEqual(expected_ints, int_outputs)
self.assertAllEqual(expected_strings, string_outputs)
def test_int_vocab_with_mask_but_no_oov_fails(self):
vocab_data = [0, 11, 12, 13, 14]
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token=0,
oov_token=-1,
dtype=dtypes.int64)
with self.assertRaisesRegex(ValueError, ".*does not have the OOV token.*"):
layer.set_vocabulary(vocab_data)
def test_int_vocab_with_repeated_element_fails(self):
vocab_data = [11, 11, 34, 23, 124]
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token=0,
oov_token=-1,
dtype=dtypes.int64)
with self.assertRaisesRegex(ValueError, ".*repeated term.*11.*"):
layer.set_vocabulary(vocab_data)
def test_int_vocab_with_reserved_oov_element_fails(self):
vocab_data = [14, 38, -1, 34, 3, 84]
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token=0,
oov_token=-1,
dtype=dtypes.int64)
with self.assertRaisesRegex(ValueError, ".*Reserved OOV.*"):
layer.set_vocabulary(vocab_data)
def test_int_vocab_with_reserved_mask_element_fails(self):
vocab_data = [125, 0, 3, 4, 94]
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token=0,
oov_token=-1,
dtype=dtypes.int64)
with self.assertRaisesRegex(ValueError, ".*Reserved mask.*"):
layer.set_vocabulary(vocab_data)
@keras_parameterized.run_all_keras_modes(always_skip_eager=True)
@ -612,7 +756,12 @@ class IndexLookupSaveableTest(keras_parameterized.TestCase,
vocab_data = ["earth", "wind", "and", "fire"]
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(max_tokens=10)
layer = get_layer_class()(
max_tokens=10,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -626,7 +775,12 @@ class IndexLookupSaveableTest(keras_parameterized.TestCase,
vocab_data = ["earth", "wind", "and", "fire"]
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(max_tokens=10)
layer = get_layer_class()(
max_tokens=10,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -643,25 +797,24 @@ class IndexLookupErrorTest(keras_parameterized.TestCase,
def test_too_long_vocab_fails_in_single_setting(self):
vocab_data = ["earth", "wind", "and", "fire"]
layer = get_layer_class()(max_tokens=4)
layer = get_layer_class()(
max_tokens=4,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
with self.assertRaisesRegex(ValueError,
"vocabulary larger than the maximum vocab.*"):
layer.set_vocabulary(vocab_data)
def test_too_long_vocab_fails_in_multiple_settings(self):
vocab_data = [["earth", "wind"], ["and", "fire"]]
layer = get_layer_class()(max_tokens=4)
# The first time we call set_vocabulary, we're under the max_tokens
# so it should be fine.
layer.set_vocabulary(vocab_data[0])
with self.assertRaisesRegex(ValueError,
"vocabulary larger than the maximum vocab.*"):
layer.set_vocabulary(vocab_data[1], append=True)
def test_zero_max_tokens_fails(self):
with self.assertRaisesRegex(ValueError, ".*max_tokens.*"):
_ = get_layer_class()(max_tokens=0)
_ = get_layer_class()(
max_tokens=0,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
@keras_parameterized.run_all_keras_modes
@ -676,7 +829,12 @@ class IndexLookupSavingTest(keras_parameterized.TestCase,
# Build and validate a golden model.
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(max_tokens=None)
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -705,8 +863,9 @@ class IndexLookupSavingTest(keras_parameterized.TestCase,
@keras_parameterized.run_all_keras_modes
class IndexLookupCombinerTest(keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest):
class IndexLookupStringCombinerTest(
keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest):
def compare_text_accumulators(self, a, b, msg=None):
if a is None or b is None:
@ -834,5 +993,123 @@ class IndexLookupCombinerTest(keras_parameterized.TestCase,
self.validate_accumulator_extract(combiner, data, expected_extract_output)
@keras_parameterized.run_all_keras_modes
class IndexLookupIntCombinerTest(keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest
):
def compare_text_accumulators(self, a, b, msg=None):
if a is None or b is None:
self.assertAllEqual(a, b, msg=msg)
self.assertAllEqual(a.count_dict, b.count_dict, msg=msg)
compare_accumulators = compare_text_accumulators
def update_accumulator(self, accumulator, data):
accumulator.count_dict.update(dict(zip(data["vocab"], data["counts"])))
return accumulator
def test_combiner_api_compatibility_int_mode(self):
data = np.array([[42, 1138, 725, 1729], [42, 1138, 725, 203]])
combiner = index_lookup._IndexLookupCombiner()
expected_accumulator_output = {
"vocab": np.array([1138, 725, 42, 1729, 203]),
"counts": np.array([2, 2, 2, 1, 1]),
}
expected_extract_output = {
"vocab": np.array([1138, 725, 42, 1729, 203]),
}
expected_accumulator = combiner._create_accumulator()
expected_accumulator = self.update_accumulator(expected_accumulator,
expected_accumulator_output)
self.validate_accumulator_serialize_and_deserialize(combiner, data,
expected_accumulator)
self.validate_accumulator_uniqueness(combiner, data)
self.validate_accumulator_extract(combiner, data, expected_extract_output)
# TODO(askerryryan): Add tests confirming equivalence to behavior of
# existing tf.keras.preprocessing.text.Tokenizer.
@parameterized.named_parameters(
{
"testcase_name": "top_k_smaller_than_full_vocab",
"data": np.array([[42, 1138], [1729, 1138], [725], [1729, 1138]]),
"vocab_size": 3,
"expected_accumulator_output": {
"vocab": np.array([1138, 1729, 725, 42]),
"counts": np.array([3, 2, 1, 1]),
},
"expected_extract_output": {
"vocab": np.array([1138, 1729, 725]),
},
},
{
"testcase_name": "top_k_larger_than_full_vocab",
"data": np.array([[42, 1138], [1729, 1138], [725], [1729, 1138]]),
"vocab_size": 10,
"expected_accumulator_output": {
"vocab": np.array([1138, 1729, 725, 42]),
"counts": np.array([3, 2, 1, 1]),
},
"expected_extract_output": {
"vocab": np.array([1138, 1729, 725, 42]),
},
},
{
"testcase_name": "no_top_k",
"data": np.array([[42, 1138], [1729, 1138], [725], [1729, 1138]]),
"vocab_size": None,
"expected_accumulator_output": {
"vocab": np.array([1138, 1729, 725, 42]),
"counts": np.array([3, 2, 1, 1]),
},
"expected_extract_output": {
"vocab": np.array([1138, 1729, 725, 42]),
},
},
{
"testcase_name": "single_element_per_row",
"data": np.array([[42], [1138], [1729], [1138], [725]]),
"vocab_size": 3,
"expected_accumulator_output": {
"vocab": np.array([1138, 1729, 725, 42]),
"counts": np.array([2, 1, 1, 1]),
},
"expected_extract_output": {
"vocab": np.array([1138, 1729, 725]),
},
},
# Which tokens are retained is based on global frequency, and thus is
# sensitive to frequency within a document. In contrast, because idf only
# considers the presence of a token in a document, it is insensitive
# to the frequency of the token within the document.
{
"testcase_name":
"retained_tokens_sensitive_to_within_document_frequency",
"data":
np.array([[42, 42], [1138, 1138], [1729, 1729], [1138, 1138],
[725, 203]]),
"vocab_size":
3,
"expected_accumulator_output": {
"vocab": np.array([1138, 42, 1729, 725, 203]),
"counts": np.array([4, 2, 2, 1, 1]),
},
"expected_extract_output": {
"vocab": np.array([1138, 1729, 42]),
},
})
def test_combiner_computation(self, data, vocab_size,
expected_accumulator_output,
expected_extract_output):
combiner = index_lookup._IndexLookupCombiner(vocab_size=vocab_size)
expected_accumulator = combiner._create_accumulator()
expected_accumulator = self.update_accumulator(expected_accumulator,
expected_accumulator_output)
self.validate_accumulator_computation(combiner, data, expected_accumulator)
self.validate_accumulator_extract(combiner, data, expected_extract_output)
if __name__ == "__main__":
test.main()

View File: tensorflow/python/keras/layers/preprocessing/integer_lookup.py

@ -0,0 +1,112 @@
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras string lookup preprocessing layer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.python.framework import dtypes
from tensorflow.python.keras.layers.preprocessing import index_lookup
from tensorflow.python.keras.layers.preprocessing import table_utils
class IntegerLookup(index_lookup.IndexLookup):
"""Maps integers from a vocabulary to integer indices.
This layer translates a set of arbitrary integers into an integer output via a
table-based lookup, with optional out-of-vocabulary handling.
If desired, the user can call this layer's `adapt()` method on a data set,
which will analyze the data set, determine the frequency of individual integer
values, and create a vocabulary from them. This vocabulary can have
unlimited size or be capped, depending on the configuration options for this
layer; if there are more unique values in the input than the maximum
vocabulary size, the most frequent terms will be used to create the
vocabulary.
Attributes:
max_values: The maximum size of the vocabulary for this layer. If None,
there is no cap on the size of the vocabulary. Note that this vocabulary
includes the OOV and mask values, so the effective number of values is
(max_values - num_oov_indices - (1 if mask_value else 0))
num_oov_indices: The number of out-of-vocabulary values to use; defaults to
1. If this value is more than 1, OOV inputs are hashed to determine their
OOV value; if this value is 0, passing an OOV input will result in a '-1'
being returned for that value in the output tensor. (Note that, because
the value is -1 and not 0, this will allow you to effectively drop OOV
values from categorical encodings.)
mask_value: A value that represents masked inputs, and which is mapped to
index 0. Defaults to 0. If set to None, no mask value will be added and the
OOV values, if any, will be indexed from (0...num_oov_indices) instead of
(1...num_oov_indices+1).
oov_value: The value representing an out-of-vocabulary value. Defaults to
-1.
vocabulary: An optional list of values, or a path to a text file containing
a vocabulary to load into this layer. The file should contain one value
per line. If the list or file contains the same token multiple times, an
error will be thrown.
"""
def __init__(self,
max_values=None,
num_oov_indices=1,
mask_value=0,
oov_value=-1,
vocabulary=None,
**kwargs):
allowed_dtypes = [dtypes.int64]
if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes:
raise ValueError("IntegerLookup may only have a dtype in %s." %
allowed_dtypes)
if "dtype" not in kwargs:
kwargs["dtype"] = dtypes.int64
# If max_values is set, the value must be greater than 1 - otherwise we
# are creating a 0-element vocab, which doesn't make sense.
if max_values is not None and max_values <= 1:
raise ValueError("If set, max_values must be greater than 1.")
if num_oov_indices < 0:
raise ValueError("num_oov_indices must be greater than 0. You passed %s" %
num_oov_indices)
if vocabulary is not None:
if isinstance(vocabulary, str):
vocabulary = table_utils.get_vocabulary_from_file(vocabulary)
vocabulary = [int(v) for v in vocabulary]
super(IntegerLookup, self).__init__(
max_tokens=max_values,
num_oov_indices=num_oov_indices,
mask_token=mask_value,
oov_token=oov_value,
vocabulary=vocabulary,
**kwargs)
def get_config(self):
base_config = super(IntegerLookup, self).get_config()
# Because the super config has a bunch of args we're also passing,
# we need to rename and remove them from the config dict.
base_config["max_values"] = base_config["max_tokens"]
del base_config["max_tokens"]
base_config["mask_value"] = base_config["mask_token"]
del base_config["mask_token"]
base_config["oov_value"] = base_config["oov_token"]
del base_config["oov_token"]
return base_config
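# Illustrative sketch of the resulting config (hypothetical max_values value):
#
#   layer = IntegerLookup(max_values=1000)
#   config = layer.get_config()
#   # config carries "max_values", "mask_value" and "oov_value" rather than
#   # the parent IndexLookup keys "max_tokens", "mask_token" and "oov_token".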

View File: tensorflow/python/keras/layers/preprocessing/integer_lookup_test.py

@ -0,0 +1,501 @@
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for Keras text vectorization preprocessing layer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import itertools
import os
import random
from absl.testing import parameterized
import numpy as np
from tensorflow.python import keras
from tensorflow.python import tf2
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.eager import context
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import sparse_tensor
from tensorflow.python.framework import tensor_shape
from tensorflow.python.keras import keras_parameterized
from tensorflow.python.keras import testing_utils
from tensorflow.python.keras.layers.preprocessing import integer_lookup
from tensorflow.python.keras.layers.preprocessing import integer_lookup_v1
from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils
from tensorflow.python.keras.saving import save
from tensorflow.python.keras.utils.generic_utils import CustomObjectScope
from tensorflow.python.ops.ragged import ragged_factory_ops
from tensorflow.python.platform import gfile
from tensorflow.python.platform import test
def get_layer_class():
if context.executing_eagerly():
return integer_lookup.IntegerLookup
else:
return integer_lookup_v1.IntegerLookup
def _get_end_to_end_test_cases():
test_cases = (
{
"testcase_name":
"test_ints_soft_vocab_cap",
# Create an array where 1138 is the most frequent term, followed by
# 1729, then 725, then 42. This ensures that the vocab accumulator
# is sorting by frequency.
"vocab_data":
np.array([[42], [1138], [1138], [1138], [1138], [1729], [1729],
[1729], [725], [725]],
dtype=np.int64),
"input_data":
np.array([[1138], [1729], [725], [42], [42], [725], [1138], [4]],
dtype=np.int64),
"kwargs": {
"max_values": None,
"dtype": dtypes.int64,
},
"expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]],
"input_dtype":
dtypes.int64
},)
crossed_test_cases = []
# Cross above test cases with use_dataset in (True, False)
for use_dataset in (True, False):
for case in test_cases:
case = case.copy()
if use_dataset:
case["testcase_name"] = case["testcase_name"] + "_with_dataset"
case["use_dataset"] = use_dataset
crossed_test_cases.append(case)
return crossed_test_cases
@keras_parameterized.run_all_keras_modes
class IntegerLookupLayerTest(keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest):
@parameterized.named_parameters(*_get_end_to_end_test_cases())
def test_layer_end_to_end_with_adapt(self, vocab_data, input_data, kwargs,
use_dataset, expected_output,
input_dtype):
cls = get_layer_class()
expected_output_dtype = dtypes.int64
input_shape = input_data.shape
if use_dataset:
# Keras APIs expect batched datasets.
# TODO(rachelim): `model.predict` predicts the result on each
# dataset batch separately, then tries to concatenate the results
# together. When the results have different shapes on the non-concat
# axis (which can happen in the output_mode = INT case for
# IntegerLookup), the concatenation fails. In real use cases, this may
# not be an issue because users are likely to pipe the preprocessing layer
# into other keras layers instead of predicting it directly. A workaround
# for these unit tests is to have the dataset only contain one batch, so
# no concatenation needs to happen with the result. For consistency with
# numpy input, we should make `predict` join differently shaped results
# together sensibly, with 0 padding.
input_data = dataset_ops.Dataset.from_tensor_slices(input_data).batch(
input_shape[0])
vocab_data = dataset_ops.Dataset.from_tensor_slices(vocab_data).batch(
input_shape[0])
with CustomObjectScope({"IntegerLookup": cls}):
output_data = testing_utils.layer_test(
cls,
kwargs=kwargs,
input_shape=input_shape,
input_data=input_data,
input_dtype=input_dtype,
expected_output_dtype=expected_output_dtype,
validate_training=False,
adapt_data=vocab_data)
self.assertAllClose(expected_output, output_data)
@keras_parameterized.run_all_keras_modes
class CategoricalEncodingInputTest(
keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest):
def test_sparse_int_input(self):
vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
input_array = sparse_tensor.SparseTensor(
indices=[[0, 0], [1, 2]],
values=np.array([13, 32], dtype=np.int64),
dense_shape=[3, 4])
expected_indices = [[0, 0], [1, 2]]
expected_values = [5, 1]
expected_dense_shape = [3, 4]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
layer = get_layer_class()(max_values=None)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_data = model.predict(input_array, steps=1)
self.assertAllEqual(expected_indices, output_data.indices)
self.assertAllEqual(expected_values, output_data.values)
self.assertAllEqual(expected_dense_shape, output_data.dense_shape)
def test_ragged_int_input(self):
vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 42]],
dtype=np.int64)
expected_output = [[2, 3, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True)
layer = get_layer_class()(max_values=None)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
@keras_parameterized.run_all_keras_modes
class CategoricalEncodingMultiOOVTest(
keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest):
def test_sparse_int_input_multi_bucket(self):
vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
input_array = sparse_tensor.SparseTensor(
indices=[[0, 0], [1, 2]],
values=np.array([13, 133], dtype=np.int64),
dense_shape=[3, 4])
expected_indices = [[0, 0], [1, 2]]
expected_values = [6, 2]
expected_dense_shape = [3, 4]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
layer = get_layer_class()(
max_values=None,
dtype=dtypes.int64,
num_oov_indices=2,
mask_value=0,
oov_value=-1)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_data = model.predict(input_array, steps=1)
self.assertAllEqual(expected_indices, output_data.indices)
self.assertAllEqual(expected_values, output_data.values)
self.assertAllEqual(expected_dense_shape, output_data.dense_shape)
def test_ragged_int_input_multi_bucket(self):
vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 133]],
dtype=np.int64)
expected_output = [[3, 4, 6], [6, 5, 3, 2]]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True)
layer = get_layer_class()(max_values=None, num_oov_indices=2)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
@keras_parameterized.run_all_keras_modes
class CategoricalEncodingAdaptTest(
keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest):
def test_sparse_adapt(self):
vocab_data = sparse_tensor.SparseTensor(
indices=[[0, 0], [0, 1], [1, 2]],
values=[203, 1729, 203],
dense_shape=[3, 4])
vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data)
layer = get_layer_class()()
layer.adapt(vocab_dataset)
expected_vocabulary = [0, -1, 203, 1729]
self.assertAllEqual(expected_vocabulary, layer.get_vocabulary())
def test_ragged_adapt(self):
vocab_data = ragged_factory_ops.constant([[203], [1729, 203]])
vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data)
layer = get_layer_class()()
layer.adapt(vocab_dataset)
expected_vocabulary = [0, -1, 203, 1729]
self.assertAllEqual(expected_vocabulary, layer.get_vocabulary())
def test_sparse_int_input(self):
vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
input_array = sparse_tensor.SparseTensor(
indices=[[0, 0], [1, 2]],
values=np.array([13, 32], dtype=np.int64),
dense_shape=[3, 4])
expected_indices = [[0, 0], [1, 2]]
expected_values = [5, 1]
expected_dense_shape = [3, 4]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
layer = get_layer_class()(max_values=None)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_data = model.predict(input_array, steps=1)
self.assertAllEqual(expected_indices, output_data.indices)
self.assertAllEqual(expected_values, output_data.values)
self.assertAllEqual(expected_dense_shape, output_data.dense_shape)
def test_ragged_int_input(self):
vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 42]],
dtype=np.int64)
expected_output = [[2, 3, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True)
layer = get_layer_class()(max_values=None)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
def test_single_int_generator_dataset(self):
def word_gen():
for _ in itertools.count(1):
yield random.randint(0, 100)
ds = dataset_ops.Dataset.from_generator(word_gen, dtypes.int64,
tensor_shape.TensorShape([]))
batched_ds = ds.take(2)
input_t = keras.Input(shape=(), dtype=dtypes.int64)
layer = get_layer_class()(
max_values=10, num_oov_indices=0, mask_value=None, oov_value=None)
_ = layer(input_t)
layer.adapt(batched_ds)
@keras_parameterized.run_all_keras_modes
class IntegerLookupOutputTest(keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest):
def test_int_output(self):
vocab_data = [42, 1138, 725, 1729]
input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
layer = get_layer_class()()
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
def test_output_shape(self):
input_data = keras.Input(shape=(4,), dtype=dtypes.int64)
layer = get_layer_class()(max_values=None, num_oov_indices=1)
int_data = layer(input_data)
self.assertAllEqual(int_data.shape[1:], input_data.shape[1:])
def test_int_output_no_reserved_zero(self):
vocab_data = [42, 1138, 725, 1729]
input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
layer = get_layer_class()(max_values=None, mask_value=None)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
def test_int_output_explicit_vocab(self):
vocab_data = [42, 1138, 725, 1729]
input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
layer = get_layer_class()(
vocabulary=vocab_data,
max_values=None,
)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
@keras_parameterized.run_all_keras_modes
class IntegerLookupVocabularyTest(
keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest):
def _write_to_temp_file(self, file_name, vocab_list):
vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt")
with gfile.GFile(vocab_path, "w") as writer:
for vocab in vocab_list:
writer.write(str(vocab) + "\n")
writer.flush()
writer.close()
return vocab_path
def test_int_output_explicit_vocab(self):
vocab_data = [42, 1138, 725, 1729]
input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
layer = get_layer_class()(vocabulary=vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
def test_get_vocab_returns_int(self):
vocab_data = [42, 1138, 725, 1729]
expected_vocab = [0, -1, 42, 1138, 725, 1729]
layer = get_layer_class()(vocabulary=vocab_data)
layer_vocab = layer.get_vocabulary()
self.assertAllEqual(expected_vocab, layer_vocab)
self.assertIsInstance(layer_vocab[0], np.int64)
def test_int_output_explicit_vocab_from_file(self):
vocab_list = [42, 1138, 725, 1729]
vocab_path = self._write_to_temp_file("vocab_file", vocab_list)
input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
layer = get_layer_class()(vocabulary=vocab_path)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
def test_non_unique_vocab_fails(self):
vocab_data = [42, 1138, 725, 1729, 1729]
with self.assertRaisesRegex(ValueError, ".*repeated term.*1729.*"):
_ = get_layer_class()(vocabulary=vocab_data)
def test_non_unique_vocab_from_file_fails(self):
vocab_list = [42, 1138, 725, 1729, 42]
vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list)
with self.assertRaisesRegex(ValueError, ".*repeated term.*42.*"):
_ = get_layer_class()(vocabulary=vocab_path)
@keras_parameterized.run_all_keras_modes(always_skip_eager=True)
class IntegerLookupSaveableTest(keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest
):
def test_ops_are_not_added_with_multiple_get_set_weights(self):
vocab_data = [42, 1138, 725, 1729]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
layer = get_layer_class()(max_values=10)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
weights = model.get_weights()
model.set_weights(weights)
keras.backend.get_session().graph.finalize()
weights = model.get_weights()
model.set_weights(weights)
def test_layer_saving_with_h5(self):
vocab_data = [42, 1138, 725, 1729]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
layer = get_layer_class()(max_values=10)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
path = os.path.join(self.get_temp_dir(), "model")
with self.assertRaisesRegex(NotImplementedError,
"Save or restore weights that is not.*"):
save.save_model(model, path, save_format="h5")
@keras_parameterized.run_all_keras_modes
class IntegerLookupErrorTest(keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest):
def test_too_long_vocab_fails_in_single_setting(self):
vocab_data = [42, 1138, 725, 1729]
layer = get_layer_class()(max_values=4, num_oov_indices=1)
with self.assertRaisesRegex(ValueError,
"vocabulary larger than the maximum vocab.*"):
layer.set_vocabulary(vocab_data)
def test_zero_max_values_fails(self):
with self.assertRaisesRegex(ValueError, ".*max_values.*"):
_ = get_layer_class()(max_values=0, num_oov_indices=1)
@keras_parameterized.run_all_keras_modes
class IntegerLookupSavingTest(keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest):
def test_vocabulary_persistence_across_saving(self):
vocab_data = [42, 1138, 725, 1729]
input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
# Build and validate a golden model.
input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
layer = get_layer_class()(max_values=None, num_oov_indices=1)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllEqual(output_dataset, expected_output)
# Save the model to disk.
output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model")
model.save(output_path, save_format="tf")
# Delete the session and graph to ensure that the loaded model is generated
# from scratch.
# TODO(b/149526183): Can't clear session when TF2 is disabled.
if tf2.enabled():
keras.backend.clear_session()
loaded_model = keras.models.load_model(
output_path, custom_objects={"IntegerLookup": get_layer_class()})
# Ensure that the loaded model is unique (so that the save/load is real)
self.assertIsNot(model, loaded_model)
# Validate correctness of the new model.
new_output_dataset = loaded_model.predict(input_array)
self.assertAllEqual(new_output_dataset, expected_output)
if __name__ == "__main__":
test.main()
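The expected outputs in these tests pin down the default index layout of IntegerLookup: index 0 is the mask value (0), index 1 is the out-of-vocabulary value (-1), and vocabulary terms are assigned indices starting at 2. A minimal usage sketch (illustrative only, not part of this commit, assuming an eager TF2 environment):

import numpy as np
from tensorflow.python.keras.layers.preprocessing import integer_lookup

# Vocabulary terms get indices 2..5; 0 and -1 are reserved for the mask and
# OOV values (see test_get_vocab_returns_int above).
layer = integer_lookup.IntegerLookup(vocabulary=[42, 1138, 725, 1729])
print(layer.get_vocabulary())        # [0, -1, 42, 1138, 725, 1729]
print(layer(np.array([[42, 203]])))  # [[2, 1]]: 203 is OOV and maps to 1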

View File

@ -0,0 +1,25 @@
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras string lookup preprocessing layer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.python.keras.layers.preprocessing import index_lookup_v1
from tensorflow.python.keras.layers.preprocessing import integer_lookup
class IntegerLookup(integer_lookup.IntegerLookup, index_lookup_v1.IndexLookup):
pass

View File

@ -0,0 +1,106 @@
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras string lookup preprocessing layer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.python.framework import dtypes
from tensorflow.python.keras.layers.preprocessing import index_lookup
from tensorflow.python.keras.layers.preprocessing import table_utils
class StringLookup(index_lookup.IndexLookup):
"""Maps strings from a vocabulary to integer indices.
This layer translates a set of arbitrary strings into an integer output via a
table-based lookup, with optional out-of-vocabulary handling.
If desired, the user can call this layer's `adapt()` method on a data set,
which will analyze the data set, determine the frequency of individual string
values, and create a vocabulary from them. This vocabulary can have
unlimited size or be capped, depending on the configuration options for this
layer; if there are more unique values in the input than the maximum
vocabulary size, the most frequent terms will be used to create the
vocabulary.
Attributes:
max_tokens: The maximum size of the vocabulary for this layer. If None,
there is no cap on the size of the vocabulary. Note that this vocabulary
includes the OOV and mask tokens, so the effective number of tokens is
(max_tokens - num_oov_indices - (1 if mask_token else 0))
num_oov_indices: The number of out-of-vocabulary tokens to use; defaults to
1. If this value is more than 1, OOV inputs are hashed to determine their
OOV value; if this value is 0, passing an OOV input will result in a '-1'
being returned for that value in the output tensor. (Note that, because
the value is -1 and not 0, this will allow you to effectively drop OOV
values from categorical encodings.)
mask_token: A token that represents masked values, and which is mapped to
index 0. Defaults to the empty string "". If set to None, no mask term
will be added and the OOV tokens, if any, will be indexed from
(0...num_oov_indices) instead of (1...num_oov_indices+1).
oov_token: The token representing an out-of-vocabulary value. Defaults to
"[OOV]".
vocabulary: An optional list of vocabulary terms, or a path to a text file
containing a vocabulary to load into this layer. The file should contain
one token per line. If the list or file contains the same token multiple
times, an error will be thrown.
encoding: The Python string encoding to use. Defaults to `'utf-8'`.
"""
def __init__(self,
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
vocabulary=None,
encoding="utf-8",
**kwargs):
allowed_dtypes = [dtypes.string]
if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes:
raise ValueError("StringLookup may only have a dtype in %s." %
allowed_dtypes)
if "dtype" not in kwargs:
kwargs["dtype"] = dtypes.string
if vocabulary is not None:
if isinstance(vocabulary, str):
vocabulary = table_utils.get_vocabulary_from_file(vocabulary, encoding)
self.encoding = encoding
super(StringLookup, self).__init__(
max_tokens=max_tokens,
num_oov_indices=num_oov_indices,
mask_token=mask_token,
oov_token=oov_token,
vocabulary=vocabulary,
**kwargs)
def get_config(self):
config = {"encoding": self.encoding}
base_config = super(StringLookup, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def get_vocabulary(self):
if self._table_handler.vocab_size() == 0:
return []
keys, values = self._table_handler.data()
# This is required because the MutableHashTable doesn't preserve insertion
# order, but we rely on the order of the array to assign indices.
return [x.decode(self.encoding) for _, x in sorted(zip(values, keys))]
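To make the attribute defaults above concrete, here is a hedged usage sketch (illustrative only, not part of this commit, assuming eager execution). With the default mask_token "" and oov_token "[OOV]", vocabulary terms are indexed from 2 and unknown strings map to index 1:

import numpy as np
from tensorflow.python.keras.layers.preprocessing import string_lookup

layer = string_lookup.StringLookup(vocabulary=["earth", "wind", "and", "fire"])
print(layer.get_vocabulary())  # ['', '[OOV]', 'earth', 'wind', 'and', 'fire']
print(layer(np.array([["fire", "michigan"]])))  # [[5, 1]]: 'michigan' is OOV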

View File

@ -0,0 +1,224 @@
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for Keras text vectorization preprocessing layer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl.testing import parameterized
import numpy as np
import six
from tensorflow.python import keras
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.eager import context
from tensorflow.python.framework import dtypes
from tensorflow.python.keras import keras_parameterized
from tensorflow.python.keras import testing_utils
from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils
from tensorflow.python.keras.layers.preprocessing import string_lookup
from tensorflow.python.keras.layers.preprocessing import string_lookup_v1
from tensorflow.python.keras.saving import save
from tensorflow.python.keras.utils.generic_utils import CustomObjectScope
from tensorflow.python.platform import gfile
from tensorflow.python.platform import test
def get_layer_class():
if context.executing_eagerly():
return string_lookup.StringLookup
else:
return string_lookup_v1.StringLookup
def _get_end_to_end_test_cases():
test_cases = (
{
"testcase_name":
"test_strings_soft_vocab_cap",
# Create an array where 'earth' is the most frequent term, followed by
# 'wind', then 'and', then 'fire'. This ensures that the vocab
# accumulator is sorting by frequency.
"vocab_data":
np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"],
["wind"], ["wind"], ["wind"], ["and"], ["and"]]),
"input_data":
np.array([["earth"], ["wind"], ["and"], ["fire"], ["fire"],
["and"], ["earth"], ["michigan"]]),
"kwargs": {
"max_tokens": None,
},
"expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]],
"input_dtype":
dtypes.string
},
)
crossed_test_cases = []
# Cross above test cases with use_dataset in (True, False)
for use_dataset in (True, False):
for case in test_cases:
case = case.copy()
if use_dataset:
case["testcase_name"] = case["testcase_name"] + "_with_dataset"
case["use_dataset"] = use_dataset
crossed_test_cases.append(case)
return crossed_test_cases
@keras_parameterized.run_all_keras_modes
class StringLookupLayerTest(keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest):
@parameterized.named_parameters(*_get_end_to_end_test_cases())
def test_layer_end_to_end_with_adapt(self, vocab_data, input_data, kwargs,
use_dataset, expected_output,
input_dtype):
cls = get_layer_class()
expected_output_dtype = dtypes.int64
input_shape = input_data.shape
if use_dataset:
# Keras APIs expect batched datasets.
# TODO(rachelim): `model.predict` predicts the result on each
# dataset batch separately, then tries to concatenate the results
# together. When the results have different shapes on the non-concat
# axis (which can happen in the output_mode = INT case for
# StringLookup), the concatenation fails. In real use cases, this may
# not be an issue because users are likely to pipe the preprocessing layer
# into other keras layers instead of predicting it directly. A workaround
# for these unit tests is to have the dataset only contain one batch, so
# no concatenation needs to happen with the result. For consistency with
# numpy input, we should make `predict` join differently shaped results
# together sensibly, with 0 padding.
input_data = dataset_ops.Dataset.from_tensor_slices(input_data).batch(
input_shape[0])
vocab_data = dataset_ops.Dataset.from_tensor_slices(vocab_data).batch(
input_shape[0])
with CustomObjectScope({"StringLookup": cls}):
output_data = testing_utils.layer_test(
cls,
kwargs=kwargs,
input_shape=input_shape,
input_data=input_data,
input_dtype=input_dtype,
expected_output_dtype=expected_output_dtype,
validate_training=False,
adapt_data=vocab_data)
self.assertAllClose(expected_output, output_data)
@keras_parameterized.run_all_keras_modes
class StringLookupVocabularyTest(keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest
):
def _write_to_temp_file(self, file_name, vocab_list):
vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt")
with gfile.GFile(vocab_path, "w") as writer:
for vocab in vocab_list:
writer.write(vocab + "\n")
writer.flush()
writer.close()
return vocab_path
def test_int_output_explicit_vocab(self):
vocab_data = ["earth", "wind", "and", "fire"]
input_array = np.array([["earth", "wind", "and", "fire"],
["fire", "and", "earth", "michigan"]])
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(vocabulary=vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
def test_get_vocab_returns_str(self):
vocab_data = ["earth", "wind", "and", "fire"]
expected_vocab = ["", "[OOV]", "earth", "wind", "and", "fire"]
layer = get_layer_class()(vocabulary=vocab_data)
layer_vocab = layer.get_vocabulary()
self.assertAllEqual(expected_vocab, layer_vocab)
self.assertIsInstance(layer_vocab[0], six.text_type)
def test_int_output_explicit_vocab_from_file(self):
vocab_list = ["earth", "wind", "and", "fire"]
vocab_path = self._write_to_temp_file("vocab_file", vocab_list)
input_array = np.array([["earth", "wind", "and", "fire"],
["fire", "and", "earth", "michigan"]])
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(vocabulary=vocab_path)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
def test_non_unique_vocab_fails(self):
vocab_data = ["earth", "wind", "and", "fire", "fire"]
with self.assertRaisesRegex(ValueError, ".*repeated term.*fire.*"):
_ = get_layer_class()(vocabulary=vocab_data)
def test_non_unique_vocab_from_file_fails(self):
vocab_list = ["earth", "wind", "and", "fire", "earth"]
vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list)
with self.assertRaisesRegex(ValueError, ".*repeated term.*earth.*"):
_ = get_layer_class()(vocabulary=vocab_path)
@keras_parameterized.run_all_keras_modes(always_skip_eager=True)
class StringLookupSaveableTest(keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest):
def test_ops_are_not_added_with_multiple_get_set_weights(self):
vocab_data = ["earth", "wind", "and", "fire"]
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(max_tokens=10)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
weights = model.get_weights()
model.set_weights(weights)
keras.backend.get_session().graph.finalize()
weights = model.get_weights()
model.set_weights(weights)
def test_layer_saving_with_h5(self):
vocab_data = ["earth", "wind", "and", "fire"]
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(max_tokens=10)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
path = os.path.join(self.get_temp_dir(), "model")
with self.assertRaisesRegex(NotImplementedError,
"Save or restore weights that is not.*"):
save.save_model(model, path, save_format="h5")
if __name__ == "__main__":
test.main()

View File

@ -0,0 +1,25 @@
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras string lookup preprocessing layer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.python.keras.layers.preprocessing import index_lookup_v1
from tensorflow.python.keras.layers.preprocessing import string_lookup
class StringLookup(string_lookup.StringLookup, index_lookup_v1.IndexLookup):
pass

View File

@ -189,4 +189,3 @@ def convert_to_ndarray(x, dtype=None):
if np.can_cast(array.dtype, np_dtype):
array = array.astype(np_dtype, casting="safe")
return array

View File

@ -32,7 +32,7 @@ from tensorflow.python.keras import backend as K
from tensorflow.python.keras.engine.base_preprocessing_layer import Combiner
from tensorflow.python.keras.engine.base_preprocessing_layer import CombinerPreprocessingLayer
from tensorflow.python.keras.layers.preprocessing import categorical_encoding
from tensorflow.python.keras.layers.preprocessing import index_lookup
from tensorflow.python.keras.layers.preprocessing import string_lookup
from tensorflow.python.keras.utils import layer_utils
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
@ -269,10 +269,6 @@ class TextVectorization(CombinerPreprocessingLayer):
self._max_tokens = max_tokens
# In INT mode, we have two reserved values (PAD and OOV). However, non-INT
# modes don't have a PAD value, so we only need to reserve one value.
self._reserved_values = 2 if output_mode == INT else 1
# In INT mode, the zero value is reserved for padding (per Keras standard
# padding approaches). In non-INT modes, there is no padding so we can set
# the OOV value to zero instead of one.
@ -303,9 +299,9 @@ class TextVectorization(CombinerPreprocessingLayer):
self._max_vocab_size, compute_idf=output_mode == TFIDF),
**kwargs)
reserve_zero = output_mode in [None, INT]
mask_token = "" if output_mode in [None, INT] else None
self._index_lookup_layer = self._get_index_lookup_class()(
max_tokens=max_tokens, reserve_zero=reserve_zero, dtype=dtypes.string)
max_tokens=max_tokens, mask_token=mask_token)
# If this layer is configured for string or integer output, we do not
# create a vectorization layer (as the output is not vectorized).
@ -328,7 +324,7 @@ class TextVectorization(CombinerPreprocessingLayer):
return (keys.numpy(), values.numpy())
def _get_index_lookup_class(self):
return index_lookup.IndexLookup
return string_lookup.StringLookup
def _to_numpy(self, preprocessed_data):
"""Converts preprocessed inputs into numpy arrays."""
@ -428,26 +424,21 @@ class TextVectorization(CombinerPreprocessingLayer):
def set_vocabulary(self,
vocab,
df_data=None,
oov_df_value=None,
append=False):
oov_df_value=None):
"""Sets vocabulary (and optionally document frequency) data for this layer.
This method sets the vocabulary and DF data for this layer directly, instead
of analyzing a dataset through 'adapt'. It should be used whenever the vocab
(and optionally document frequency) information is already known. If
vocabulary data is already present in the layer, this method will either
replace it, if 'append' is set to False, or append to it (if 'append' is set
to True).
vocabulary data is already present in the layer, this method will replace
it.
Arguments:
vocab: An array of string tokens.
df_data: An array of document frequency data. Only necessary if the layer
output_mode is TFIDF.
oov_df_value: The document frequency of the OOV token. Only necessary if
output_mode is TFIDF. OOV data is optional when appending additional
data in TFIDF mode; if an OOV value is supplied it will overwrite the
existing OOV value.
append: Whether to overwrite or append any existing vocabulary data.
output_mode is TFIDF.
Raises:
ValueError: If there are too many inputs, the inputs do not match, or
@ -468,8 +459,7 @@ class TextVectorization(CombinerPreprocessingLayer):
"be changed after the layer is "
"called.").format(mode=self._output_mode))
current_table_size = self._index_lookup_layer.vocab_size()
self._index_lookup_layer.set_vocabulary(vocab, append)
self._index_lookup_layer.set_vocabulary(vocab)
# When doing raw or integer output, we don't have a Vectorize layer to
# manage. In this case, we can return directly.
@ -477,14 +467,9 @@ class TextVectorization(CombinerPreprocessingLayer):
return
if not self._pad_to_max or self._max_tokens is None:
num_tokens = self._index_lookup_layer.vocab_size() + self._reserved_values
num_tokens = self._index_lookup_layer.vocab_size()
self._vectorize_layer.set_num_elements(num_tokens)
# We're only _really_ appending if the table_size is nonzero. This is
# important for some sanity checks in tfidf mode (specifically, checking if
# oov_df_value is set or not) and handling existing tfidf weight data.
append = append if current_table_size > 0 else False
if self._output_mode == TFIDF:
if df_data is None:
raise ValueError("df_data must be set if output_mode is TFIDF")
@ -492,31 +477,14 @@ class TextVectorization(CombinerPreprocessingLayer):
raise ValueError("df_data must be the same length as vocab. "
"len(df_data) is %s, len(vocab) is %s" %
(len(vocab), len(df_data)))
if not append and oov_df_value is None:
raise ValueError("You must pass an oov_df_value the first time "
"'set_vocabulary' is called when output_mode is "
if oov_df_value is None:
raise ValueError("You must pass an oov_df_value when output_mode is "
"TFIDF.")
df_data = self._convert_to_ndarray(df_data)
if append:
# The existing IDF data is stored in a Keras weight, so we can get it
# by calling K.get_value() on the weight object. Take the first
# table_size+1 values in case we're padding the weight with zeros
existing_df_data = K.get_value(
self._vectorize_layer.tf_idf_weights)[:current_table_size + 1]
df_data = np.append(existing_df_data, df_data, axis=0)
# If we are appending and need to replace the OOV DF value, we can
# assign it over the existing OOV DF value at index 0 of the (already-
# concatenated) DF value array.
if oov_df_value is not None:
df_data[0] = oov_df_value
else:
# If we are not appending (that is, we have only new data) we need to
# insert the OOV value to the front of the array. (This is an append to
# the head, not a replacement of the zeroth value.)
if not isinstance(oov_df_value, np.ndarray):
oov_df_value = np.array([oov_df_value])
df_data = np.insert(df_data, 0, oov_df_value)
if not isinstance(oov_df_value, np.ndarray):
oov_df_value = np.array([oov_df_value])
df_data = np.insert(df_data, 0, oov_df_value)
self._vectorize_layer.set_tfidf_data(df_data)
def build(self, input_shape):
@ -536,8 +504,10 @@ class TextVectorization(CombinerPreprocessingLayer):
if not self.built:
raise RuntimeError("_set_state_variables() must be called after build().")
if self._output_mode == TFIDF:
self.set_vocabulary(updates[_VOCAB_NAME], updates[_IDF_NAME],
updates[_OOV_IDF_NAME])
self.set_vocabulary(
updates[_VOCAB_NAME],
updates[_IDF_NAME],
updates[_OOV_IDF_NAME])
else:
self.set_vocabulary(updates[_VOCAB_NAME])
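With the append path removed, TFIDF vocabulary data is now set in a single call, and oov_df_value must always be supplied in TFIDF mode (it is inserted at index 0 of the DF array internally). An illustrative sketch of the updated call, mirroring the arguments used in the tests further down (not part of this commit, assuming eager execution):

from tensorflow.python.keras.layers.preprocessing import text_vectorization

layer = text_vectorization.TextVectorization(
    max_tokens=5, standardize=None, split=None,
    output_mode=text_vectorization.TFIDF)
# One-shot call: vocabulary, per-term document frequencies, and the OOV
# document frequency are all provided together; there is no append argument.
layer.set_vocabulary(
    ["earth", "wind", "and", "fire"],
    df_data=[.5, .25, .2, .125],
    oov_df_value=.05)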

View File

@ -619,25 +619,6 @@ class TextVectorizationOutputTest(
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
def test_vocab_appending(self):
vocab_data = [["earth", "wind"], ["and", "fire"]]
input_array = np.array([["earth", "wind", "and", "fire"],
["fire", "and", "earth", "michigan"]])
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(
max_tokens=5,
standardize=None,
split=None,
output_mode=text_vectorization.INT)
layer.set_vocabulary(vocab_data[0])
layer.set_vocabulary(vocab_data[1], append=True)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllClose(expected_output, output_dataset)
def test_int_output_densifies_with_zeros(self):
vocab_data = ["earth", "wind", "and", "fire"]
# Create an input array that has 5 elements in the first example and 4 in
@ -1046,7 +1027,10 @@ class TextVectorizationOutputTest(
split=None,
output_mode=text_vectorization.TFIDF,
pad_to_max_tokens=True)
layer.set_vocabulary(vocab_data, df_data=tfidf_data, oov_df_value=.05)
layer.set_vocabulary(
vocab_data,
df_data=tfidf_data,
oov_df_value=.05)
int_data = layer(input_data)
self.assertAllEqual(expected_output_shape, int_data.shape.as_list())
@ -1084,60 +1068,6 @@ class TextVectorizationOutputTest(
output_dataset = model.predict(input_array)
self.assertAllClose(expected_output, output_dataset)
def test_tfidf_appending(self):
vocab_data = [["earth", "wind"], ["and", "fire"]]
tfidf_data = [[.5, .25], [.2, .125]]
input_array = np.array([["earth", "wind", "and", "earth"],
["ohio", "fire", "earth", "michigan"]])
# pyformat: disable
# pylint: disable=bad-whitespace
expected_output = [[ 0, 1, .25, .2, 0],
[.1, .5, 0, 0, .125]]
# pylint: enable=bad-whitespace
# pyformat: enable
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(
max_tokens=5,
standardize=None,
split=None,
output_mode=text_vectorization.TFIDF)
layer.set_vocabulary(vocab_data[0], df_data=tfidf_data[0], oov_df_value=.05)
layer.set_vocabulary(vocab_data[1], df_data=tfidf_data[1], append=True)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllClose(expected_output, output_dataset)
def test_tfidf_appending_with_oov_replacement(self):
vocab_data = [["earth", "wind"], ["and", "fire"]]
tfidf_data = [[.5, .25], [.2, .125]]
input_array = np.array([["earth", "wind", "and", "earth"],
["ohio", "fire", "earth", "michigan"]])
# pyformat: disable
# pylint: disable=bad-whitespace
expected_output = [[ 0, 1, .25, .2, 0],
[1.5, .5, 0, 0, .125]]
# pylint: enable=bad-whitespace
# pyformat: enable
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(
max_tokens=5,
standardize=None,
split=None,
output_mode=text_vectorization.TFIDF)
layer.set_vocabulary(vocab_data[0], df_data=tfidf_data[0], oov_df_value=.05)
# Note that here we've replaced the OOV value.
layer.set_vocabulary(
vocab_data[1], df_data=tfidf_data[1], oov_df_value=.75, append=True)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllClose(expected_output, output_dataset)
def test_accept_1D_input(self):
input_array = np.array(["earth wind and fire",
"fire and earth michigan"])
@ -1274,22 +1204,6 @@ class TextVectorizationErrorTest(keras_parameterized.TestCase,
"vocabulary larger than the maximum vocab.*"):
layer.set_vocabulary(vocab_data)
def test_too_long_vocab_fails_in_multiple_settings(self):
vocab_data = [["earth", "wind"], ["and", "fire"]]
layer = get_layer_class()(
max_tokens=4,
standardize=None,
split=None,
output_mode=text_vectorization.INT)
# The first time we call set_vocabulary, we're under the max_tokens limit
# so it should be fine.
layer.set_vocabulary(vocab_data[0])
with self.assertRaisesRegex(ValueError,
"vocabulary larger than the maximum vocab.*"):
layer.set_vocabulary(vocab_data[1], append=True)
def test_setting_vocab_without_tfidf_data_fails_in_tfidf_mode(self):
vocab_data = ["earth", "wind", "and", "fire"]
@ -1326,18 +1240,6 @@ class TextVectorizationErrorTest(keras_parameterized.TestCase,
"You must pass an oov_df_value.*"):
layer.set_vocabulary(vocab_data, df_data)
def test_tfidf_set_vocab_with_no_oov_fails_with_append_set(self):
vocab_data = ["earth", "wind", "and", "fire"]
df_data = [1, 2, 3, 4]
layer = get_layer_class()(
max_tokens=5,
standardize=None,
split=None,
output_mode=text_vectorization.TFIDF)
with self.assertRaisesRegex(ValueError,
"You must pass an oov_df_value.*"):
layer.set_vocabulary(vocab_data, df_data, append=True)
def test_set_tfidf_in_non_tfidf_fails(self):
vocab_data = ["earth", "wind", "and", "fire"]
df_data = [1, 2, 3, 4]

View File

@ -23,7 +23,7 @@ import numpy as np
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.engine import base_preprocessing_layer_v1
from tensorflow.python.keras.layers.preprocessing import categorical_encoding_v1
from tensorflow.python.keras.layers.preprocessing import index_lookup_v1
from tensorflow.python.keras.layers.preprocessing import string_lookup_v1
from tensorflow.python.keras.layers.preprocessing import text_vectorization
from tensorflow.python.ops.ragged import ragged_tensor_value
from tensorflow.python.util.tf_export import keras_export
@ -84,7 +84,7 @@ class TextVectorization(text_vectorization.TextVectorization,
return categorical_encoding_v1.CategoricalEncoding
def _get_index_lookup_class(self):
return index_lookup_v1.IndexLookup
return string_lookup_v1.StringLookup
def _to_numpy(self, data):
"""Converts preprocessed inputs into numpy arrays."""

View File

@ -221,7 +221,7 @@ tf_class {
}
member_method {
name: "set_vocabulary"
argspec: "args=[\'self\', \'vocab\', \'df_data\', \'oov_df_value\', \'append\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], "
argspec: "args=[\'self\', \'vocab\', \'df_data\', \'oov_df_value\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
}
member_method {
name: "set_weights"

View File

@ -219,7 +219,7 @@ tf_class {
}
member_method {
name: "set_vocabulary"
argspec: "args=[\'self\', \'vocab\', \'df_data\', \'oov_df_value\', \'append\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], "
argspec: "args=[\'self\', \'vocab\', \'df_data\', \'oov_df_value\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
}
member_method {
name: "set_weights"