Split index_lookup into string_lookup and integer_lookup.
PiperOrigin-RevId: 311651579 Change-Id: Ie033727dbe1026a7c7a88e4b31653840a17ac3d1
This commit is contained in:
parent
0de7edf8b1
commit
efa3fb28d9
@ -27,10 +27,12 @@ py_library(
|
||||
":discretization",
|
||||
":hashing",
|
||||
":image_preprocessing",
|
||||
":integer_lookup",
|
||||
":normalization",
|
||||
":preprocessing_stage",
|
||||
":preprocessing_test_utils",
|
||||
":reduction",
|
||||
":string_lookup",
|
||||
":text_vectorization",
|
||||
],
|
||||
)
|
||||
@ -146,6 +148,20 @@ py_library(
|
||||
],
|
||||
)
|
||||
|
||||
py_library(
|
||||
name = "integer_lookup",
|
||||
srcs = [
|
||||
"integer_lookup.py",
|
||||
"integer_lookup_v1.py",
|
||||
],
|
||||
srcs_version = "PY2AND3",
|
||||
deps = [
|
||||
":index_lookup",
|
||||
":table_utils",
|
||||
"//tensorflow/python:dtypes",
|
||||
],
|
||||
)
|
||||
|
||||
py_library(
|
||||
name = "table_utils",
|
||||
srcs = [
|
||||
@ -179,7 +195,7 @@ py_library(
|
||||
srcs_version = "PY2AND3",
|
||||
deps = [
|
||||
":categorical_encoding",
|
||||
":index_lookup",
|
||||
":string_lookup",
|
||||
"//tensorflow/python:array_ops",
|
||||
"//tensorflow/python:control_flow_ops",
|
||||
"//tensorflow/python:dtypes",
|
||||
@ -235,6 +251,20 @@ py_library(
|
||||
],
|
||||
)
|
||||
|
||||
py_library(
|
||||
name = "string_lookup",
|
||||
srcs = [
|
||||
"string_lookup.py",
|
||||
"string_lookup_v1.py",
|
||||
],
|
||||
srcs_version = "PY2AND3",
|
||||
deps = [
|
||||
":index_lookup",
|
||||
":table_utils",
|
||||
"//tensorflow/python:dtypes",
|
||||
],
|
||||
)
|
||||
|
||||
py_library(
|
||||
name = "preprocessing_stage",
|
||||
srcs = [
|
||||
@ -442,6 +472,22 @@ tf_py_test(
|
||||
],
|
||||
)
|
||||
|
||||
tf_py_test(
|
||||
name = "integer_lookup_test",
|
||||
size = "medium",
|
||||
srcs = ["integer_lookup_test.py"],
|
||||
python_version = "PY3",
|
||||
deps = [
|
||||
":integer_lookup",
|
||||
":preprocessing_test_utils",
|
||||
"//tensorflow/python:client_testlib",
|
||||
"//tensorflow/python/keras",
|
||||
"//tensorflow/python/keras/utils:generic_utils",
|
||||
"//tensorflow/python/ops/ragged:ragged_string_ops",
|
||||
"@absl_py//absl/testing:parameterized",
|
||||
],
|
||||
)
|
||||
|
||||
distribute_py_test(
|
||||
name = "normalization_distribution_test",
|
||||
srcs = ["normalization_distribution_test.py"],
|
||||
@ -517,6 +563,22 @@ tf_py_test(
|
||||
],
|
||||
)
|
||||
|
||||
tf_py_test(
|
||||
name = "string_lookup_test",
|
||||
size = "medium",
|
||||
srcs = ["string_lookup_test.py"],
|
||||
python_version = "PY3",
|
||||
deps = [
|
||||
":preprocessing_test_utils",
|
||||
":string_lookup",
|
||||
"//tensorflow/python:client_testlib",
|
||||
"//tensorflow/python/keras",
|
||||
"//tensorflow/python/keras/utils:generic_utils",
|
||||
"//tensorflow/python/ops/ragged:ragged_string_ops",
|
||||
"@absl_py//absl/testing:parameterized",
|
||||
],
|
||||
)
|
||||
|
||||
tf_py_test(
|
||||
name = "preprocessing_stage_test",
|
||||
srcs = ["preprocessing_stage_test.py"],
|
||||
|
@ -41,14 +41,16 @@ _ACCUMULATOR_COUNTS_NAME = "counts"
|
||||
|
||||
|
||||
class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer):
|
||||
"""Maps strings (or integers) from a vocabulary to integer indices.
|
||||
"""Maps values from a vocabulary to integer indices.
|
||||
|
||||
This layer translates a set of arbitrary strings or integers into an integer
|
||||
output via a table-based lookup, with optional out-of-vocabulary handling.
|
||||
This layer translates a set of arbitrary hashables into an integer output via
|
||||
a table-based lookup, with optional out-of-vocabulary handling. This is the
|
||||
basis layer for both IntegerLookup and IndexLookup; it holds the common
|
||||
logic but is not intended to be exported as part of the Keras API.
|
||||
|
||||
If desired, the user can call this layer's `adapt()` method on a data set,
|
||||
which will analyze the data set, determine the frequency of individual string
|
||||
or integer values, and create a vocabulary from them. This vocabulary can have
|
||||
values, and create a vocabulary from them. This vocabulary can have
|
||||
unlimited size or be capped, depending on the configuration options for this
|
||||
layer; if there are more unique values in the input than the maximum
|
||||
vocabulary size, the most frequent terms will be used to create the
|
||||
@ -56,84 +58,47 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer):
|
||||
|
||||
Attributes:
|
||||
max_tokens: The maximum size of the vocabulary for this layer. If None,
|
||||
there is no cap on the size of the vocabulary. Note that the vocabulary
|
||||
does include OOV buckets, so the effective number of unique values in the
|
||||
vocabulary is `(max_tokens - num_oov_tokens)` when this value is set.
|
||||
num_oov_tokens: The number of out-of-vocabulary tokens to use; defaults to
|
||||
1. If this value is more than 1, OOV inputs are hashed to determine their
|
||||
OOV value; if this value is 0, passing an OOV input will result in a '-1'
|
||||
being returned for that value in the output tensor. (Note that, because
|
||||
the value is -1 and not 0, this will allow you to effectively drop OOV
|
||||
values from categorical encodings.)
|
||||
vocabulary: An optional list of vocabulary terms, or a path to a text file
|
||||
containing a vocabulary to load into this layer. The file should contain
|
||||
one token per line. In either case, the vocabulary must be unique; if
|
||||
the list or file contains the same token multiple times, an error will
|
||||
be thrown. Note that when passing a vocabulary - either as a list or as
|
||||
a file - the vocabulary will not be present in the layer's config dict;
|
||||
it will instead be a part of the layer's weights.
|
||||
reserve_zero: Whether to reserve the index 0, which indicates pad values in
|
||||
the Keras masking system. If True, the output of this layer will be in the
|
||||
range `[1...max_tokens+1)`; if False, the output will be in the range
|
||||
`[0...max_tokens)`. Defaults to True.
|
||||
mask_zero: If True, input values of 0 (for integers) and `""` (for strings)
|
||||
will be treated as masked values and assigned an output value of 0. If
|
||||
this option is set, `reserve_zero` must also be set. Defaults to False.
|
||||
Call arguments:
|
||||
inputs: The data to look up. Can be a tf.Tensor or RaggedTensor.
|
||||
invert: Controls the lookup direction. If False, the layer will map strings
|
||||
to integers; if true, the layer will map integers to strings. Defaults
|
||||
to False.
|
||||
there is no cap on the size of the vocabulary. Note that this vocabulary
|
||||
includes the OOV and mask tokens, so the effective number of tokens is
|
||||
(max_tokens - num_oov_indices - (1 if mask_token else 0))
|
||||
num_oov_indices: The number of out-of-vocabulary tokens to use. If this
|
||||
value is more than 1, OOV inputs are hashed to determine their OOV value;
|
||||
if this value is 0, passing an OOV input will result in a '-1' being
|
||||
returned for that value in the output tensor. (Note that, because the
|
||||
value is -1 and not 0, this will allow you to effectively drop OOV values
|
||||
from categorical encodings.)
|
||||
mask_token: A token that represents masked values, and which is mapped to
|
||||
index 0. If set to None, no mask term will be added and the OOV tokens, if
|
||||
any, will be indexed from (0...num_oov_indices) instead of
|
||||
(1...num_oov_indices+1).
|
||||
oov_token: The token representing an out-of-vocabulary value. This token is
|
||||
only used when performing an inverse lookup.
|
||||
vocabulary: An optional list of vocabulary terms. If the list contains the
|
||||
same token multiple times, an error will be thrown.
|
||||
"""
|
||||
# TODO(momernick): Add an examples section to the docstring.
|
||||
|
||||
def __init__(self,
|
||||
max_tokens=None,
|
||||
num_oov_tokens=1,
|
||||
max_tokens,
|
||||
num_oov_indices,
|
||||
mask_token,
|
||||
oov_token,
|
||||
vocabulary=None,
|
||||
reserve_zero=True,
|
||||
mask_zero=False,
|
||||
**kwargs):
|
||||
invert = False
|
||||
if invert:
|
||||
allowed_dtypes = [dtypes.int32, dtypes.int64]
|
||||
else:
|
||||
allowed_dtypes = [dtypes.string, dtypes.int32, dtypes.int64]
|
||||
|
||||
if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes:
|
||||
raise ValueError("TextVectorization may only have a dtype in %s." %
|
||||
allowed_dtypes)
|
||||
|
||||
if "dtype" not in kwargs:
|
||||
kwargs["dtype"] = dtypes.int64 if invert else dtypes.string
|
||||
|
||||
# If max_tokens is set, the value must be greater than 1 - otherwise we
|
||||
# are creating a 0-element vocab, which doesn't make sense.
|
||||
if max_tokens is not None and max_tokens <= 1:
|
||||
raise ValueError("If set, max_tokens must be greater than 1.")
|
||||
|
||||
if num_oov_tokens < 0:
|
||||
raise ValueError("num_oov_tokens must be greater than 0. You passed %s" %
|
||||
num_oov_tokens)
|
||||
if num_oov_indices < 0:
|
||||
raise ValueError("num_oov_indices must be greater than 0. You passed %s" %
|
||||
num_oov_indices)
|
||||
|
||||
self.invert = invert
|
||||
self.max_tokens = max_tokens
|
||||
self.num_oov_tokens = num_oov_tokens
|
||||
self.reserve_zero = reserve_zero
|
||||
self.mask_zero = mask_zero
|
||||
|
||||
# We need to reserve at least num_oov_tokens tokens, plus one additional
|
||||
# value if we are reserving the zero value in our output.
|
||||
if reserve_zero:
|
||||
self._reserved_values = (num_oov_tokens + 1)
|
||||
else:
|
||||
self._reserved_values = num_oov_tokens
|
||||
|
||||
# We need to account for the OOV buckets in our vocabulary size.
|
||||
if max_tokens is not None:
|
||||
self._max_elements = max_tokens - num_oov_tokens
|
||||
else:
|
||||
self._max_elements = None
|
||||
self.num_oov_indices = num_oov_indices
|
||||
self.oov_token = oov_token
|
||||
self.mask_token = mask_token
|
||||
|
||||
# If there is only one OOV bucket, we can determine the OOV value (either 0
|
||||
# or 1 depending on whether 0 is reserved) and set that as the default
|
||||
@ -141,20 +106,17 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer):
|
||||
# do a further hashing step; to make this easier, we set the OOV value to
|
||||
# -1. (This lets us do a vectorized add and cast to boolean to determine
|
||||
# locations where we need to do extra hashing.)
|
||||
if self.num_oov_tokens == 1:
|
||||
self._oov_value = 1 if reserve_zero else 0
|
||||
if self.num_oov_indices == 1:
|
||||
self._oov_value = 0 if mask_token is None else 1
|
||||
else:
|
||||
self._oov_value = -1
|
||||
|
||||
super(IndexLookup, self).__init__(
|
||||
combiner=_IndexLookupCombiner(self.max_tokens), **kwargs)
|
||||
combiner=_IndexLookupCombiner(self.max_tokens, self.mask_token),
|
||||
**kwargs)
|
||||
|
||||
self._output_dtype = dtypes.int64
|
||||
|
||||
# If the layer's input type is int32, we can only output int32 values -
|
||||
# MutableHashTable doesn't allow us to map int32->int64.
|
||||
if self.dtype == dtypes.int32:
|
||||
self._output_dtype = dtypes.int32
|
||||
else:
|
||||
self._output_dtype = dtypes.int64
|
||||
self._table = lookup_ops.MutableHashTable(
|
||||
key_dtype=self.dtype,
|
||||
value_dtype=self._output_dtype,
|
||||
@ -167,33 +129,27 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer):
|
||||
# counting code in the Model object doesn't throw an attribute error.
|
||||
tracked_table.shape = tensor_shape.TensorShape((0,))
|
||||
|
||||
if self.num_oov_tokens <= 1:
|
||||
oov_tokens = None
|
||||
if self.num_oov_indices <= 1:
|
||||
oov_indices = None
|
||||
else:
|
||||
oov_start = 1 if reserve_zero else 0
|
||||
oov_tokens = list(range(oov_start, self._reserved_values))
|
||||
oov_start = 1 if mask_token is not None else 0
|
||||
oov_end = oov_start + num_oov_indices
|
||||
oov_indices = list(range(oov_start, oov_end))
|
||||
|
||||
self._table_handler = table_utils.TableHandler(
|
||||
table=self._table,
|
||||
oov_tokens=oov_tokens,
|
||||
oov_tokens=oov_indices,
|
||||
use_v1_apis=self._use_v1_apis())
|
||||
|
||||
if vocabulary is not None:
|
||||
if isinstance(vocabulary, str):
|
||||
vocabulary = table_utils.get_vocabulary_from_file(vocabulary)
|
||||
table_utils.validate_vocabulary_is_unique(vocabulary)
|
||||
|
||||
self.set_vocabulary(vocabulary)
|
||||
|
||||
def compute_output_shape(self, input_shape):
|
||||
return input_shape
|
||||
|
||||
def compute_output_signature(self, input_spec, invert=False):
|
||||
def compute_output_signature(self, input_spec):
|
||||
output_shape = self.compute_output_shape(input_spec.shape.as_list())
|
||||
if invert:
|
||||
output_dtype = dtypes.string
|
||||
else:
|
||||
output_dtype = dtypes.int64
|
||||
output_dtype = dtypes.int64
|
||||
return tensor_spec.TensorSpec(shape=output_shape, dtype=output_dtype)
|
||||
|
||||
def adapt(self, data, reset_state=True):
|
||||
@ -220,10 +176,7 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer):
|
||||
keys, values = self._table_handler.data()
|
||||
# This is required because the MutableHashTable doesn't preserve insertion
|
||||
# order, but we rely on the order of the array to assign indices.
|
||||
if self.dtype == dtypes.string:
|
||||
return [x.decode("utf-8") for _, x in sorted(zip(values, keys))]
|
||||
else:
|
||||
return [x for _, x in sorted(zip(values, keys))]
|
||||
return [x for _, x in sorted(zip(values, keys))]
|
||||
|
||||
def vocab_size(self):
|
||||
return self._table_handler.vocab_size()
|
||||
@ -231,10 +184,9 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer):
|
||||
def get_config(self):
|
||||
config = {
|
||||
"max_tokens": self.max_tokens,
|
||||
"num_oov_tokens": self.num_oov_tokens,
|
||||
"vocabulary": None,
|
||||
"reserve_zero": self.reserve_zero,
|
||||
"mask_zero": self.mask_zero,
|
||||
"num_oov_indices": self.num_oov_indices,
|
||||
"oov_token": self.oov_token,
|
||||
"mask_token": self.mask_token,
|
||||
}
|
||||
base_config = super(IndexLookup, self).get_config()
|
||||
return dict(list(base_config.items()) + list(config.items()))
|
||||
@ -246,46 +198,101 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer):
|
||||
# abstraction for ease of saving!) we return 0.
|
||||
return 0
|
||||
|
||||
def set_vocabulary(self,
|
||||
vocab,
|
||||
append=False):
|
||||
def set_vocabulary(self, vocab):
|
||||
"""Sets vocabulary (and optionally document frequency) data for this layer.
|
||||
|
||||
This method sets the vocabulary for this layer directly, instead of
|
||||
analyzing a dataset through 'adapt'. It should be used whenever the vocab
|
||||
information is already known. If vocabulary data is already present in the
|
||||
layer, this method will either replace it, if 'append' is set to False, or
|
||||
append to it (if 'append' is set to True).
|
||||
layer, this method will either replace it
|
||||
|
||||
Arguments:
|
||||
vocab: An array of string tokens.
|
||||
append: Whether to overwrite or append any existing vocabulary data.
|
||||
|
||||
Raises:
|
||||
ValueError: If there are too many inputs, the inputs do not match, or
|
||||
input data is missing.
|
||||
"""
|
||||
current_table_size = self._table_handler.vocab_size()
|
||||
total_vocab_size = len(vocab) + (current_table_size if append else 0)
|
||||
if self.max_tokens is not None and total_vocab_size > self._max_elements:
|
||||
|
||||
table_utils.validate_vocabulary_is_unique(vocab)
|
||||
|
||||
should_have_mask = self.mask_token is not None
|
||||
if should_have_mask:
|
||||
has_mask = vocab[0] == self.mask_token
|
||||
oov_start = 1
|
||||
else:
|
||||
has_mask = False
|
||||
oov_start = 0
|
||||
|
||||
should_have_oov = self.num_oov_indices > 0
|
||||
if should_have_oov:
|
||||
oov_end = oov_start + self.num_oov_indices
|
||||
expected_oov = [self.oov_token] * self.num_oov_indices
|
||||
has_oov = vocab[oov_start:oov_end] == expected_oov
|
||||
# If we get a numpy array, then has_oov may end up being a numpy array
|
||||
# instead of a bool. Fix this by collapsing the variable if it's not bool.
|
||||
if not isinstance(has_oov, bool):
|
||||
has_oov = any(has_oov)
|
||||
else:
|
||||
has_oov = False
|
||||
|
||||
if all([should_have_mask, has_mask, should_have_oov]) and not has_oov:
|
||||
raise ValueError("The passed vocabulary has the correct mask token `%s` "
|
||||
"at index 0, but does not have the OOV token `%s` in "
|
||||
"indices [%s:%s]. Instead, we found `%s`. Was this "
|
||||
"vocabulary generated by a layer with incompatible "
|
||||
"settings?" %
|
||||
(self.mask_token, self.oov_token, oov_start, oov_end,
|
||||
vocab[oov_start:oov_end]))
|
||||
|
||||
if all([should_have_oov, has_oov, should_have_mask]) and not has_mask:
|
||||
raise ValueError(
|
||||
"The passed vocabulary has the correct OOV token `%s` at "
|
||||
"indices [%s:%s], but does not have the mask token `%s` in "
|
||||
"index 0. Instead, we found `%s`. Was this vocabulary "
|
||||
"generated by a layer with incompatible settings?" %
|
||||
(self.oov_token, oov_start, oov_end, self.mask_token, vocab[0]))
|
||||
|
||||
insert_special_tokens = not has_oov and not has_mask
|
||||
|
||||
special_tokens = [] if self.mask_token is None else [self.mask_token]
|
||||
special_tokens.extend([self.oov_token] * self.num_oov_indices)
|
||||
|
||||
num_special_tokens = len(special_tokens)
|
||||
tokens = vocab if insert_special_tokens else vocab[num_special_tokens:]
|
||||
if self.mask_token in tokens:
|
||||
raise ValueError("Reserved mask token %s was found in the passed "
|
||||
"vocabulary at index %s. Please either remove the "
|
||||
"reserved token from the vocabulary or change the "
|
||||
"mask token for this layer." %
|
||||
(self.mask_token, tokens.index(self.mask_token)))
|
||||
if self.oov_token in tokens:
|
||||
raise ValueError("Reserved OOV token %s was found in the passed "
|
||||
"vocabulary at index %s. Please either remove the "
|
||||
"reserved token from the vocabulary or change the "
|
||||
"OOV token for this layer." %
|
||||
(self.oov_token, tokens.index(self.oov_token)))
|
||||
|
||||
if insert_special_tokens:
|
||||
total_vocab_size = len(vocab) + num_special_tokens
|
||||
else:
|
||||
total_vocab_size = len(vocab)
|
||||
if self.max_tokens is not None and total_vocab_size > self.max_tokens:
|
||||
raise ValueError(
|
||||
"Attempted to set a vocabulary larger than the maximum vocab size. "
|
||||
"Passed vocab size is %s, max vocab size is %s. Note that the OOV "
|
||||
"token(s) are automatically added to the number of tokens." %
|
||||
"Passed vocab size is %s, max vocab size is %s." %
|
||||
(total_vocab_size, self.max_tokens))
|
||||
|
||||
start_index = self._reserved_values + (current_table_size if append else 0)
|
||||
start_index = num_special_tokens
|
||||
values = np.arange(start_index, len(vocab) + start_index, dtype=np.int64)
|
||||
vocab = table_utils.convert_to_ndarray(vocab, self.dtype)
|
||||
table_utils.assert_same_type(self.dtype, vocab, "vocab")
|
||||
|
||||
values = table_utils.convert_to_ndarray(values, self._output_dtype)
|
||||
table_utils.assert_same_type(self._output_dtype, values, "values")
|
||||
|
||||
if not append and current_table_size > 0:
|
||||
self._table_handler.clear()
|
||||
self._table_handler.clear()
|
||||
self._table_handler.insert(vocab, values)
|
||||
|
||||
if insert_special_tokens and num_special_tokens > 0:
|
||||
special_token_values = np.arange(num_special_tokens, dtype=np.int64)
|
||||
self._table_handler.insert(special_tokens, special_token_values)
|
||||
|
||||
def _set_state_variables(self, updates):
|
||||
if not self.built:
|
||||
raise RuntimeError("_set_state_variables() must be called after build().")
|
||||
@ -316,18 +323,20 @@ class _IndexLookupCombiner(base_preprocessing_layer.Combiner):
|
||||
dataset, all tokens are retained.s
|
||||
"""
|
||||
|
||||
def __init__(self, vocab_size=None):
|
||||
def __init__(self, vocab_size=None, mask_value=None):
|
||||
self._vocab_size = vocab_size
|
||||
self._mask_value = mask_value
|
||||
|
||||
def compute(self, values, accumulator=None):
|
||||
"""Compute a step in this computation, returning a new accumulator."""
|
||||
values = base_preprocessing_layer.convert_to_list(values)
|
||||
values = base_preprocessing_layer.convert_to_list(
|
||||
values, sparse_default_value=self._mask_value)
|
||||
|
||||
if accumulator is None:
|
||||
accumulator = self._create_accumulator()
|
||||
|
||||
# TODO(momernick): Benchmark improvements to this algorithm.
|
||||
if isinstance(values, (str, bytes)):
|
||||
if isinstance(values, (str, bytes, np.int64)):
|
||||
accumulator.count_dict[values] += 1
|
||||
else:
|
||||
for document in values:
|
||||
@ -362,6 +371,8 @@ class _IndexLookupCombiner(base_preprocessing_layer.Combiner):
|
||||
"vocab": A list of the retained items in the vocabulary.
|
||||
"""
|
||||
vocab_counts = accumulator.count_dict
|
||||
if self._mask_value in vocab_counts:
|
||||
del vocab_counts[self._mask_value]
|
||||
sorted_counts = sorted(
|
||||
vocab_counts.items(), key=operator.itemgetter(1, 0), reverse=True)
|
||||
vocab_data = (
|
||||
|
@ -65,7 +65,12 @@ class IndexLookupDistributionTest(
|
||||
|
||||
with distribution.scope():
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
|
||||
layer = get_layer_class()()
|
||||
layer = get_layer_class()(
|
||||
max_tokens=None,
|
||||
num_oov_indices=1,
|
||||
mask_token="",
|
||||
oov_token="[OOV]",
|
||||
dtype=dtypes.string)
|
||||
layer.adapt(vocab_dataset)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
|
@ -21,7 +21,6 @@ from __future__ import print_function
|
||||
import itertools
|
||||
import os
|
||||
import random
|
||||
import six
|
||||
import string
|
||||
|
||||
from absl.testing import parameterized
|
||||
@ -31,7 +30,6 @@ from tensorflow.python import keras
|
||||
from tensorflow.python import tf2
|
||||
|
||||
from tensorflow.python.data.ops import dataset_ops
|
||||
from tensorflow.python.distribute import one_device_strategy
|
||||
from tensorflow.python.eager import context
|
||||
from tensorflow.python.framework import dtypes
|
||||
from tensorflow.python.framework import sparse_tensor
|
||||
@ -44,7 +42,6 @@ from tensorflow.python.keras.layers.preprocessing import preprocessing_test_util
|
||||
from tensorflow.python.keras.saving import save
|
||||
from tensorflow.python.keras.utils.generic_utils import CustomObjectScope
|
||||
from tensorflow.python.ops.ragged import ragged_factory_ops
|
||||
from tensorflow.python.platform import gfile
|
||||
from tensorflow.python.platform import test
|
||||
|
||||
|
||||
@ -71,6 +68,10 @@ def _get_end_to_end_test_cases():
|
||||
["and"], ["earth"], ["michigan"]]),
|
||||
"kwargs": {
|
||||
"max_tokens": None,
|
||||
"num_oov_indices": 1,
|
||||
"mask_token": "",
|
||||
"oov_token": "[OOV]",
|
||||
"dtype": dtypes.string,
|
||||
},
|
||||
"expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]],
|
||||
"input_dtype":
|
||||
@ -91,6 +92,9 @@ def _get_end_to_end_test_cases():
|
||||
dtype=np.int64),
|
||||
"kwargs": {
|
||||
"max_tokens": None,
|
||||
"num_oov_indices": 1,
|
||||
"mask_token": 0,
|
||||
"oov_token": -1,
|
||||
"dtype": dtypes.int64,
|
||||
},
|
||||
"expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]],
|
||||
@ -172,7 +176,12 @@ class CategoricalEncodingInputTest(
|
||||
expected_dense_shape = [3, 4]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.string, sparse=True)
|
||||
layer = get_layer_class()(max_tokens=None)
|
||||
layer = get_layer_class()(
|
||||
max_tokens=None,
|
||||
num_oov_indices=1,
|
||||
mask_token="",
|
||||
oov_token="[OOV]",
|
||||
dtype=dtypes.string)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
@ -193,7 +202,12 @@ class CategoricalEncodingInputTest(
|
||||
expected_dense_shape = [3, 4]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
|
||||
layer = get_layer_class()(max_tokens=None, dtype=dtypes.int64)
|
||||
layer = get_layer_class()(
|
||||
max_tokens=None,
|
||||
dtype=dtypes.int64,
|
||||
num_oov_indices=1,
|
||||
mask_token=0,
|
||||
oov_token=-1)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
@ -209,7 +223,12 @@ class CategoricalEncodingInputTest(
|
||||
expected_output = [[2, 3, 5], [5, 4, 2, 1]]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.string, ragged=True)
|
||||
layer = get_layer_class()(max_tokens=None)
|
||||
layer = get_layer_class()(
|
||||
max_tokens=None,
|
||||
num_oov_indices=1,
|
||||
mask_token="",
|
||||
oov_token="[OOV]",
|
||||
dtype=dtypes.string)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
@ -223,7 +242,12 @@ class CategoricalEncodingInputTest(
|
||||
expected_output = [[2, 3, 5], [5, 4, 2, 1]]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True)
|
||||
layer = get_layer_class()(max_tokens=None, dtype=dtypes.int64)
|
||||
layer = get_layer_class()(
|
||||
max_tokens=None,
|
||||
dtype=dtypes.int64,
|
||||
num_oov_indices=1,
|
||||
mask_token=0,
|
||||
oov_token=-1)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
@ -248,7 +272,12 @@ class CategoricalEncodingMultiOOVTest(
|
||||
expected_dense_shape = [3, 4]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.string, sparse=True)
|
||||
layer = get_layer_class()(max_tokens=None, num_oov_tokens=2)
|
||||
layer = get_layer_class()(
|
||||
max_tokens=None,
|
||||
num_oov_indices=2,
|
||||
mask_token="",
|
||||
oov_token="[OOV]",
|
||||
dtype=dtypes.string)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
@ -270,7 +299,11 @@ class CategoricalEncodingMultiOOVTest(
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
|
||||
layer = get_layer_class()(
|
||||
max_tokens=None, dtype=dtypes.int64, num_oov_tokens=2)
|
||||
max_tokens=None,
|
||||
dtype=dtypes.int64,
|
||||
num_oov_indices=2,
|
||||
mask_token=0,
|
||||
oov_token=-1)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
@ -286,7 +319,12 @@ class CategoricalEncodingMultiOOVTest(
|
||||
expected_output = [[3, 4, 6], [6, 5, 3, 2]]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.string, ragged=True)
|
||||
layer = get_layer_class()(max_tokens=None, num_oov_tokens=2)
|
||||
layer = get_layer_class()(
|
||||
max_tokens=None,
|
||||
num_oov_indices=2,
|
||||
mask_token="",
|
||||
oov_token="[OOV]",
|
||||
dtype=dtypes.string)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
@ -301,7 +339,11 @@ class CategoricalEncodingMultiOOVTest(
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True)
|
||||
layer = get_layer_class()(
|
||||
max_tokens=None, dtype=dtypes.int64, num_oov_tokens=2)
|
||||
max_tokens=None,
|
||||
dtype=dtypes.int64,
|
||||
num_oov_indices=2,
|
||||
mask_token=0,
|
||||
oov_token=-1)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
@ -321,13 +363,14 @@ class CategoricalEncodingAdaptTest(
|
||||
dense_shape=[3, 4])
|
||||
vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data)
|
||||
|
||||
layer = get_layer_class()(max_tokens=None)
|
||||
layer = get_layer_class()(
|
||||
max_tokens=None,
|
||||
num_oov_indices=1,
|
||||
mask_token="",
|
||||
oov_token="[OOV]",
|
||||
dtype=dtypes.string)
|
||||
layer.adapt(vocab_dataset)
|
||||
# Note that the expected vocabulary has a null string (''). This is because
|
||||
# we assume that sparse tensors are in fact dense tensors with elided
|
||||
# values, not ragged tensors. Therefore, we assume that any missing data
|
||||
# is important and give it a spot in our vocab.
|
||||
expected_vocabulary = ["", "michigan", "fire"]
|
||||
expected_vocabulary = ["", "[OOV]", "michigan", "fire"]
|
||||
self.assertAllEqual(expected_vocabulary, layer.get_vocabulary())
|
||||
|
||||
def test_ragged_adapt(self):
|
||||
@ -335,9 +378,14 @@ class CategoricalEncodingAdaptTest(
|
||||
["fire", "michigan"]])
|
||||
vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data)
|
||||
|
||||
layer = get_layer_class()(max_tokens=None)
|
||||
layer = get_layer_class()(
|
||||
max_tokens=None,
|
||||
num_oov_indices=1,
|
||||
mask_token="",
|
||||
oov_token="[OOV]",
|
||||
dtype=dtypes.string)
|
||||
layer.adapt(vocab_dataset)
|
||||
expected_vocabulary = ["michigan", "fire"]
|
||||
expected_vocabulary = ["", "[OOV]", "michigan", "fire"]
|
||||
self.assertAllEqual(expected_vocabulary, layer.get_vocabulary())
|
||||
|
||||
def test_sparse_int_input(self):
|
||||
@ -352,7 +400,12 @@ class CategoricalEncodingAdaptTest(
|
||||
expected_dense_shape = [3, 4]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
|
||||
layer = get_layer_class()(max_tokens=None, dtype=dtypes.int64)
|
||||
layer = get_layer_class()(
|
||||
max_tokens=None,
|
||||
dtype=dtypes.int64,
|
||||
num_oov_indices=1,
|
||||
mask_token=0,
|
||||
oov_token=-1)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
@ -368,7 +421,12 @@ class CategoricalEncodingAdaptTest(
|
||||
expected_output = [[2, 3, 5], [5, 4, 2, 1]]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.string, ragged=True)
|
||||
layer = get_layer_class()(max_tokens=None)
|
||||
layer = get_layer_class()(
|
||||
max_tokens=None,
|
||||
num_oov_indices=1,
|
||||
mask_token="",
|
||||
oov_token="[OOV]",
|
||||
dtype=dtypes.string)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
@ -382,7 +440,12 @@ class CategoricalEncodingAdaptTest(
|
||||
expected_output = [[2, 3, 5], [5, 4, 2, 1]]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True)
|
||||
layer = get_layer_class()(max_tokens=None, dtype=dtypes.int64)
|
||||
layer = get_layer_class()(
|
||||
max_tokens=None,
|
||||
dtype=dtypes.int64,
|
||||
num_oov_indices=1,
|
||||
mask_token=0,
|
||||
oov_token=-1)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
@ -400,34 +463,15 @@ class CategoricalEncodingAdaptTest(
|
||||
batched_ds = ds.take(2)
|
||||
input_t = keras.Input(shape=(), dtype=dtypes.string)
|
||||
layer = get_layer_class()(
|
||||
max_tokens=10, num_oov_tokens=0, reserve_zero=False)
|
||||
max_tokens=10,
|
||||
num_oov_indices=0,
|
||||
mask_token=None,
|
||||
oov_token=None,
|
||||
dtype=dtypes.string)
|
||||
_ = layer(input_t)
|
||||
layer.adapt(batched_ds)
|
||||
|
||||
|
||||
@keras_parameterized.run_all_keras_modes
|
||||
class IndexLookupDistributionTest(
|
||||
keras_parameterized.TestCase,
|
||||
preprocessing_test_utils.PreprocessingLayerTest):
|
||||
|
||||
def test_cpu_distribution(self):
|
||||
vocab_data = ["earth", "wind", "and", "fire"]
|
||||
input_array = np.array([["earth", "wind", "and", "fire"],
|
||||
["fire", "and", "earth", "michigan"]])
|
||||
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
|
||||
|
||||
strategy = one_device_strategy.OneDeviceStrategy("/cpu:0")
|
||||
|
||||
with strategy.scope():
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
|
||||
layer = get_layer_class()()
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
output_dataset = model.predict(input_array)
|
||||
self.assertAllEqual(expected_output, output_dataset)
|
||||
|
||||
|
||||
@keras_parameterized.run_all_keras_modes
|
||||
class IndexLookupOutputTest(keras_parameterized.TestCase,
|
||||
preprocessing_test_utils.PreprocessingLayerTest):
|
||||
@ -439,7 +483,12 @@ class IndexLookupOutputTest(keras_parameterized.TestCase,
|
||||
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
|
||||
layer = get_layer_class()()
|
||||
layer = get_layer_class()(
|
||||
max_tokens=None,
|
||||
num_oov_indices=1,
|
||||
mask_token="",
|
||||
oov_token="[OOV]",
|
||||
dtype=dtypes.string)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
@ -448,7 +497,12 @@ class IndexLookupOutputTest(keras_parameterized.TestCase,
|
||||
|
||||
def test_output_shape(self):
|
||||
input_data = keras.Input(shape=(4,), dtype=dtypes.string)
|
||||
layer = get_layer_class()()
|
||||
layer = get_layer_class()(
|
||||
max_tokens=None,
|
||||
num_oov_indices=1,
|
||||
mask_token="",
|
||||
oov_token="[OOV]",
|
||||
dtype=dtypes.string)
|
||||
int_data = layer(input_data)
|
||||
self.assertAllEqual(int_data.shape[1:], input_data.shape[1:])
|
||||
|
||||
@ -459,7 +513,12 @@ class IndexLookupOutputTest(keras_parameterized.TestCase,
|
||||
expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
|
||||
layer = get_layer_class()(reserve_zero=False)
|
||||
layer = get_layer_class()(
|
||||
max_tokens=None,
|
||||
num_oov_indices=1,
|
||||
mask_token=None,
|
||||
oov_token="[OOV]",
|
||||
dtype=dtypes.string)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
@ -473,7 +532,13 @@ class IndexLookupOutputTest(keras_parameterized.TestCase,
|
||||
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
|
||||
layer = get_layer_class()(vocabulary=vocab_data)
|
||||
layer = get_layer_class()(
|
||||
vocabulary=vocab_data,
|
||||
max_tokens=None,
|
||||
num_oov_indices=1,
|
||||
mask_token="",
|
||||
oov_token="[OOV]",
|
||||
dtype=dtypes.string)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
output_dataset = model.predict(input_array)
|
||||
@ -485,15 +550,6 @@ class IndexLookupVocabularyTest(keras_parameterized.TestCase,
|
||||
preprocessing_test_utils.PreprocessingLayerTest
|
||||
):
|
||||
|
||||
def _write_to_temp_file(self, file_name, vocab_list):
|
||||
vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt")
|
||||
with gfile.GFile(vocab_path, "w") as writer:
|
||||
for vocab in vocab_list:
|
||||
writer.write(vocab + "\n")
|
||||
writer.flush()
|
||||
writer.close()
|
||||
return vocab_path
|
||||
|
||||
def test_int_output_explicit_vocab(self):
|
||||
vocab_data = ["earth", "wind", "and", "fire"]
|
||||
input_array = np.array([["earth", "wind", "and", "fire"],
|
||||
@ -501,107 +557,195 @@ class IndexLookupVocabularyTest(keras_parameterized.TestCase,
|
||||
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
|
||||
layer = get_layer_class()(vocabulary=vocab_data)
|
||||
layer = get_layer_class()(
|
||||
vocabulary=vocab_data,
|
||||
max_tokens=None,
|
||||
num_oov_indices=1,
|
||||
mask_token="",
|
||||
oov_token="[OOV]",
|
||||
dtype=dtypes.string)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
output_dataset = model.predict(input_array)
|
||||
self.assertAllEqual(expected_output, output_dataset)
|
||||
|
||||
def test_get_vocab_returns_str(self):
|
||||
vocab_data = ["earth", "wind", "and", "fire"]
|
||||
layer = get_layer_class()(vocabulary=vocab_data)
|
||||
layer_vocab = layer.get_vocabulary()
|
||||
self.assertAllEqual(vocab_data, layer_vocab)
|
||||
self.assertIsInstance(layer_vocab[0], six.text_type)
|
||||
def test_vocab_with_max_cap(self):
|
||||
vocab_data = ["", "[OOV]", "wind", "and", "fire"]
|
||||
layer = get_layer_class()(
|
||||
max_tokens=5,
|
||||
num_oov_indices=1,
|
||||
mask_token="",
|
||||
oov_token="[OOV]",
|
||||
dtype=dtypes.string)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
returned_vocab = layer.get_vocabulary()
|
||||
self.assertAllEqual(vocab_data, returned_vocab)
|
||||
|
||||
def test_int_output_explicit_vocab_from_file(self):
|
||||
vocab_list = ["earth", "wind", "and", "fire"]
|
||||
vocab_path = self._write_to_temp_file("vocab_file", vocab_list)
|
||||
|
||||
input_array = np.array([["earth", "wind", "and", "fire"],
|
||||
["fire", "and", "earth", "michigan"]])
|
||||
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
|
||||
layer = get_layer_class()(vocabulary=vocab_path)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
output_dataset = model.predict(input_array)
|
||||
self.assertAllEqual(expected_output, output_dataset)
|
||||
|
||||
def test_vocab_appending(self):
|
||||
vocab_data = [["earth", "wind"], ["and", "fire"]]
|
||||
input_array = np.array([["earth", "wind", "and", "fire"],
|
||||
["fire", "and", "earth", "michigan"]])
|
||||
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
|
||||
layer = get_layer_class()(max_tokens=5)
|
||||
layer.set_vocabulary(vocab_data[0])
|
||||
layer.set_vocabulary(vocab_data[1], append=True)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
output_dataset = model.predict(input_array)
|
||||
self.assertAllClose(expected_output, output_dataset)
|
||||
def test_int_vocab_with_max_cap(self):
|
||||
vocab_data = [0, -1, 42, 1276, 1138]
|
||||
layer = get_layer_class()(
|
||||
max_tokens=5,
|
||||
num_oov_indices=1,
|
||||
mask_token=0,
|
||||
oov_token=-1,
|
||||
dtype=dtypes.int64)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
returned_vocab = layer.get_vocabulary()
|
||||
self.assertAllEqual(vocab_data, returned_vocab)
|
||||
|
||||
def test_non_unique_vocab_fails(self):
|
||||
vocab_data = ["earth", "wind", "and", "fire", "fire"]
|
||||
with self.assertRaisesRegex(ValueError, ".*repeated term.*fire.*"):
|
||||
_ = get_layer_class()(vocabulary=vocab_data)
|
||||
_ = get_layer_class()(
|
||||
vocabulary=vocab_data,
|
||||
max_tokens=None,
|
||||
num_oov_indices=1,
|
||||
mask_token="",
|
||||
oov_token="[OOV]",
|
||||
dtype=dtypes.string)
|
||||
|
||||
def test_non_unique_vocab_from_file_fails(self):
|
||||
vocab_list = ["earth", "wind", "and", "fire", "earth"]
|
||||
vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list)
|
||||
def test_vocab_with_oov_and_wrong_mask_fails(self):
|
||||
vocab_data = ["custom_mask", "[OOV]", "earth", "wind", "and", "fire"]
|
||||
layer = get_layer_class()(
|
||||
max_tokens=None,
|
||||
num_oov_indices=1,
|
||||
mask_token="",
|
||||
oov_token="[OOV]",
|
||||
dtype=dtypes.string)
|
||||
with self.assertRaisesRegex(ValueError, ".*does not have the mask token.*"):
|
||||
layer.set_vocabulary(vocab_data)
|
||||
|
||||
def test_vocab_with_oov_and_no_mask_fails(self):
|
||||
vocab_data = ["[OOV]", "earth", "wind", "and", "fire"]
|
||||
layer = get_layer_class()(
|
||||
max_tokens=None,
|
||||
num_oov_indices=1,
|
||||
mask_token="",
|
||||
oov_token="[OOV]",
|
||||
dtype=dtypes.string)
|
||||
with self.assertRaisesRegex(ValueError, ".*Reserved OOV.*"):
|
||||
layer.set_vocabulary(vocab_data)
|
||||
|
||||
def test_vocab_with_mask_but_no_oov_fails(self):
|
||||
vocab_data = ["", "earth", "wind", "and", "fire"]
|
||||
layer = get_layer_class()(
|
||||
max_tokens=None,
|
||||
num_oov_indices=1,
|
||||
mask_token="",
|
||||
oov_token="[OOV]",
|
||||
dtype=dtypes.string)
|
||||
with self.assertRaisesRegex(ValueError, ".*does not have the OOV token.*"):
|
||||
layer.set_vocabulary(vocab_data)
|
||||
|
||||
def test_vocab_with_repeated_element_fails(self):
|
||||
vocab_data = ["earth", "earth", "wind", "and", "fire"]
|
||||
layer = get_layer_class()(
|
||||
max_tokens=None,
|
||||
num_oov_indices=1,
|
||||
mask_token="",
|
||||
oov_token="[OOV]",
|
||||
dtype=dtypes.string)
|
||||
with self.assertRaisesRegex(ValueError, ".*repeated term.*earth.*"):
|
||||
_ = get_layer_class()(vocabulary=vocab_path)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
|
||||
def test_vocab_with_reserved_oov_element_fails(self):
|
||||
vocab_data = ["earth", "test", "[OOV]", "wind", "and", "fire"]
|
||||
layer = get_layer_class()(
|
||||
max_tokens=None,
|
||||
num_oov_indices=1,
|
||||
mask_token="",
|
||||
oov_token="[OOV]",
|
||||
dtype=dtypes.string)
|
||||
with self.assertRaisesRegex(ValueError, ".*Reserved OOV.*"):
|
||||
layer.set_vocabulary(vocab_data)
|
||||
|
||||
@keras_parameterized.run_all_keras_modes
|
||||
class InverseLookupOutputTest(keras_parameterized.TestCase,
|
||||
preprocessing_test_utils.PreprocessingLayerTest):
|
||||
def test_vocab_with_reserved_mask_element_fails(self):
|
||||
vocab_data = ["earth", "mask_token", "wind", "and", "fire"]
|
||||
layer = get_layer_class()(
|
||||
max_tokens=None,
|
||||
num_oov_indices=1,
|
||||
mask_token="mask_token",
|
||||
oov_token="[OOV]",
|
||||
dtype=dtypes.string)
|
||||
with self.assertRaisesRegex(ValueError, ".*Reserved mask.*"):
|
||||
layer.set_vocabulary(vocab_data)
|
||||
|
||||
def DISABLE_test_inverse_output(self):
|
||||
vocab_data = ["earth", "wind", "and", "fire"]
|
||||
input_array = np.array([["earth", "wind", "and", "fire"],
|
||||
["fire", "and", "earth", "michigan"]])
|
||||
expected_ints = [[2, 3, 4, 5], [5, 4, 2, 1]]
|
||||
# Note that the token 'michigan' has been replaced by ''. This is because
|
||||
# 'michigan' is OOV for this layer.
|
||||
expected_strings = np.array([["earth", "wind", "and", "fire"],
|
||||
["fire", "and", "earth", ""]])
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
|
||||
layer = get_layer_class()(max_tokens=None)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
string_data = layer(int_data, invert=True)
|
||||
model = keras.Model(inputs=input_data, outputs=[int_data, string_data])
|
||||
int_outputs, string_outputs = model.predict(input_array)
|
||||
self.assertAllEqual(expected_ints, int_outputs)
|
||||
self.assertAllEqual(expected_strings, string_outputs)
|
||||
def test_non_unique_int_vocab_fails(self):
|
||||
vocab_data = [12, 13, 14, 15, 15]
|
||||
with self.assertRaisesRegex(ValueError, ".*repeated term.*15.*"):
|
||||
_ = get_layer_class()(
|
||||
vocabulary=vocab_data,
|
||||
max_tokens=None,
|
||||
num_oov_indices=1,
|
||||
mask_token=0,
|
||||
oov_token=-1,
|
||||
dtype=dtypes.int64)
|
||||
|
||||
def DISABLE_test_inverse_output_serialization(self):
|
||||
vocab_data = ["earth", "wind", "and", "fire"]
|
||||
input_array = np.array([["earth", "wind", "and", "fire"],
|
||||
["fire", "and", "earth", "michigan"]])
|
||||
expected_ints = [[2, 3, 4, 5], [5, 4, 2, 1]]
|
||||
# Note that the token 'michigan' has been replaced by ''. This is because
|
||||
# 'michigan' is OOV for this layer.
|
||||
expected_strings = np.array([["earth", "wind", "and", "fire"],
|
||||
["fire", "and", "earth", ""]])
|
||||
def test_int_vocab_with_oov_and_wrong_mask_fails(self):
|
||||
vocab_data = [1234, -1, 11, 21, 13, 14]
|
||||
layer = get_layer_class()(
|
||||
max_tokens=None,
|
||||
num_oov_indices=1,
|
||||
mask_token=0,
|
||||
oov_token=-1,
|
||||
dtype=dtypes.int64)
|
||||
with self.assertRaisesRegex(ValueError, ".*does not have the mask token.*"):
|
||||
layer.set_vocabulary(vocab_data)
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
|
||||
layer = get_layer_class()(max_tokens=None)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
string_data = layer(int_data, invert=True)
|
||||
model = keras.Model(inputs=input_data, outputs=[int_data, string_data])
|
||||
def test_int_vocab_with_oov_and_no_mask_fails(self):
|
||||
vocab_data = [-1, 11, 12, 13, 14]
|
||||
layer = get_layer_class()(
|
||||
max_tokens=None,
|
||||
num_oov_indices=1,
|
||||
mask_token=0,
|
||||
oov_token=-1,
|
||||
dtype=dtypes.int64)
|
||||
with self.assertRaisesRegex(ValueError, ".*Reserved OOV.*"):
|
||||
layer.set_vocabulary(vocab_data)
|
||||
|
||||
with CustomObjectScope({"IndexLookup": get_layer_class()}):
|
||||
new_model = keras.Model.from_config(model.get_config())
|
||||
new_model.set_weights(model.get_weights())
|
||||
int_outputs, string_outputs = new_model.predict(input_array)
|
||||
self.assertAllEqual(expected_ints, int_outputs)
|
||||
self.assertAllEqual(expected_strings, string_outputs)
|
||||
def test_int_vocab_with_mask_but_no_oov_fails(self):
|
||||
vocab_data = [0, 11, 12, 13, 14]
|
||||
layer = get_layer_class()(
|
||||
max_tokens=None,
|
||||
num_oov_indices=1,
|
||||
mask_token=0,
|
||||
oov_token=-1,
|
||||
dtype=dtypes.int64)
|
||||
with self.assertRaisesRegex(ValueError, ".*does not have the OOV token.*"):
|
||||
layer.set_vocabulary(vocab_data)
|
||||
|
||||
def test_int_vocab_with_repeated_element_fails(self):
|
||||
vocab_data = [11, 11, 34, 23, 124]
|
||||
layer = get_layer_class()(
|
||||
max_tokens=None,
|
||||
num_oov_indices=1,
|
||||
mask_token=0,
|
||||
oov_token=-1,
|
||||
dtype=dtypes.int64)
|
||||
with self.assertRaisesRegex(ValueError, ".*repeated term.*11.*"):
|
||||
layer.set_vocabulary(vocab_data)
|
||||
|
||||
def test_int_vocab_with_reserved_oov_element_fails(self):
|
||||
vocab_data = [14, 38, -1, 34, 3, 84]
|
||||
layer = get_layer_class()(
|
||||
max_tokens=None,
|
||||
num_oov_indices=1,
|
||||
mask_token=0,
|
||||
oov_token=-1,
|
||||
dtype=dtypes.int64)
|
||||
with self.assertRaisesRegex(ValueError, ".*Reserved OOV.*"):
|
||||
layer.set_vocabulary(vocab_data)
|
||||
|
||||
def test_int_vocab_with_reserved_mask_element_fails(self):
|
||||
vocab_data = [125, 0, 3, 4, 94]
|
||||
layer = get_layer_class()(
|
||||
max_tokens=None,
|
||||
num_oov_indices=1,
|
||||
mask_token=0,
|
||||
oov_token=-1,
|
||||
dtype=dtypes.int64)
|
||||
with self.assertRaisesRegex(ValueError, ".*Reserved mask.*"):
|
||||
layer.set_vocabulary(vocab_data)
|
||||
|
||||
|
||||
@keras_parameterized.run_all_keras_modes(always_skip_eager=True)
|
||||
@ -612,7 +756,12 @@ class IndexLookupSaveableTest(keras_parameterized.TestCase,
|
||||
vocab_data = ["earth", "wind", "and", "fire"]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
|
||||
layer = get_layer_class()(max_tokens=10)
|
||||
layer = get_layer_class()(
|
||||
max_tokens=10,
|
||||
num_oov_indices=1,
|
||||
mask_token="",
|
||||
oov_token="[OOV]",
|
||||
dtype=dtypes.string)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
@ -626,7 +775,12 @@ class IndexLookupSaveableTest(keras_parameterized.TestCase,
|
||||
vocab_data = ["earth", "wind", "and", "fire"]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
|
||||
layer = get_layer_class()(max_tokens=10)
|
||||
layer = get_layer_class()(
|
||||
max_tokens=10,
|
||||
num_oov_indices=1,
|
||||
mask_token="",
|
||||
oov_token="[OOV]",
|
||||
dtype=dtypes.string)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
@ -643,25 +797,24 @@ class IndexLookupErrorTest(keras_parameterized.TestCase,
|
||||
def test_too_long_vocab_fails_in_single_setting(self):
|
||||
vocab_data = ["earth", "wind", "and", "fire"]
|
||||
|
||||
layer = get_layer_class()(max_tokens=4)
|
||||
layer = get_layer_class()(
|
||||
max_tokens=4,
|
||||
num_oov_indices=1,
|
||||
mask_token="",
|
||||
oov_token="[OOV]",
|
||||
dtype=dtypes.string)
|
||||
with self.assertRaisesRegex(ValueError,
|
||||
"vocabulary larger than the maximum vocab.*"):
|
||||
layer.set_vocabulary(vocab_data)
|
||||
|
||||
def test_too_long_vocab_fails_in_multiple_settings(self):
|
||||
vocab_data = [["earth", "wind"], ["and", "fire"]]
|
||||
layer = get_layer_class()(max_tokens=4)
|
||||
|
||||
# The first time we call set_vocabulary, we're under the max_tokens
|
||||
# so it should be fine.
|
||||
layer.set_vocabulary(vocab_data[0])
|
||||
with self.assertRaisesRegex(ValueError,
|
||||
"vocabulary larger than the maximum vocab.*"):
|
||||
layer.set_vocabulary(vocab_data[1], append=True)
|
||||
|
||||
def test_zero_max_tokens_fails(self):
|
||||
with self.assertRaisesRegex(ValueError, ".*max_tokens.*"):
|
||||
_ = get_layer_class()(max_tokens=0)
|
||||
_ = get_layer_class()(
|
||||
max_tokens=0,
|
||||
num_oov_indices=1,
|
||||
mask_token="",
|
||||
oov_token="[OOV]",
|
||||
dtype=dtypes.string)
|
||||
|
||||
|
||||
@keras_parameterized.run_all_keras_modes
|
||||
@ -676,7 +829,12 @@ class IndexLookupSavingTest(keras_parameterized.TestCase,
|
||||
|
||||
# Build and validate a golden model.
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
|
||||
layer = get_layer_class()(max_tokens=None)
|
||||
layer = get_layer_class()(
|
||||
max_tokens=None,
|
||||
num_oov_indices=1,
|
||||
mask_token="",
|
||||
oov_token="[OOV]",
|
||||
dtype=dtypes.string)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
@ -705,8 +863,9 @@ class IndexLookupSavingTest(keras_parameterized.TestCase,
|
||||
|
||||
|
||||
@keras_parameterized.run_all_keras_modes
|
||||
class IndexLookupCombinerTest(keras_parameterized.TestCase,
|
||||
preprocessing_test_utils.PreprocessingLayerTest):
|
||||
class IndexLookupStringCombinerTest(
|
||||
keras_parameterized.TestCase,
|
||||
preprocessing_test_utils.PreprocessingLayerTest):
|
||||
|
||||
def compare_text_accumulators(self, a, b, msg=None):
|
||||
if a is None or b is None:
|
||||
@ -834,5 +993,123 @@ class IndexLookupCombinerTest(keras_parameterized.TestCase,
|
||||
self.validate_accumulator_extract(combiner, data, expected_extract_output)
|
||||
|
||||
|
||||
@keras_parameterized.run_all_keras_modes
|
||||
class IndexLookupIntCombinerTest(keras_parameterized.TestCase,
|
||||
preprocessing_test_utils.PreprocessingLayerTest
|
||||
):
|
||||
|
||||
def compare_text_accumulators(self, a, b, msg=None):
|
||||
if a is None or b is None:
|
||||
self.assertAllEqual(a, b, msg=msg)
|
||||
|
||||
self.assertAllEqual(a.count_dict, b.count_dict, msg=msg)
|
||||
|
||||
compare_accumulators = compare_text_accumulators
|
||||
|
||||
def update_accumulator(self, accumulator, data):
|
||||
accumulator.count_dict.update(dict(zip(data["vocab"], data["counts"])))
|
||||
|
||||
return accumulator
|
||||
|
||||
def test_combiner_api_compatibility_int_mode(self):
|
||||
data = np.array([[42, 1138, 725, 1729], [42, 1138, 725, 203]])
|
||||
combiner = index_lookup._IndexLookupCombiner()
|
||||
expected_accumulator_output = {
|
||||
"vocab": np.array([1138, 725, 42, 1729, 203]),
|
||||
"counts": np.array([2, 2, 2, 1, 1]),
|
||||
}
|
||||
expected_extract_output = {
|
||||
"vocab": np.array([1138, 725, 42, 1729, 203]),
|
||||
}
|
||||
expected_accumulator = combiner._create_accumulator()
|
||||
expected_accumulator = self.update_accumulator(expected_accumulator,
|
||||
expected_accumulator_output)
|
||||
self.validate_accumulator_serialize_and_deserialize(combiner, data,
|
||||
expected_accumulator)
|
||||
self.validate_accumulator_uniqueness(combiner, data)
|
||||
self.validate_accumulator_extract(combiner, data, expected_extract_output)
|
||||
|
||||
# TODO(askerryryan): Add tests confirming equivalence to behavior of
|
||||
# existing tf.keras.preprocessing.text.Tokenizer.
|
||||
@parameterized.named_parameters(
|
||||
{
|
||||
"testcase_name": "top_k_smaller_than_full_vocab",
|
||||
"data": np.array([[42, 1138], [1729, 1138], [725], [1729, 1138]]),
|
||||
"vocab_size": 3,
|
||||
"expected_accumulator_output": {
|
||||
"vocab": np.array([1138, 1729, 725, 42]),
|
||||
"counts": np.array([3, 2, 1, 1]),
|
||||
},
|
||||
"expected_extract_output": {
|
||||
"vocab": np.array([1138, 1729, 725]),
|
||||
},
|
||||
},
|
||||
{
|
||||
"testcase_name": "top_k_larger_than_full_vocab",
|
||||
"data": np.array([[42, 1138], [1729, 1138], [725], [1729, 1138]]),
|
||||
"vocab_size": 10,
|
||||
"expected_accumulator_output": {
|
||||
"vocab": np.array([1138, 1729, 725, 42]),
|
||||
"counts": np.array([3, 2, 1, 1]),
|
||||
},
|
||||
"expected_extract_output": {
|
||||
"vocab": np.array([1138, 1729, 725, 42]),
|
||||
},
|
||||
},
|
||||
{
|
||||
"testcase_name": "no_top_k",
|
||||
"data": np.array([[42, 1138], [1729, 1138], [725], [1729, 1138]]),
|
||||
"vocab_size": None,
|
||||
"expected_accumulator_output": {
|
||||
"vocab": np.array([1138, 1729, 725, 42]),
|
||||
"counts": np.array([3, 2, 1, 1]),
|
||||
},
|
||||
"expected_extract_output": {
|
||||
"vocab": np.array([1138, 1729, 725, 42]),
|
||||
},
|
||||
},
|
||||
{
|
||||
"testcase_name": "single_element_per_row",
|
||||
"data": np.array([[42], [1138], [1729], [1138], [725]]),
|
||||
"vocab_size": 3,
|
||||
"expected_accumulator_output": {
|
||||
"vocab": np.array([1138, 1729, 725, 42]),
|
||||
"counts": np.array([2, 1, 1, 1]),
|
||||
},
|
||||
"expected_extract_output": {
|
||||
"vocab": np.array([1138, 1729, 725]),
|
||||
},
|
||||
},
|
||||
# Which tokens are retained are based on global frequency, and thus are
|
||||
# sensitive to frequency within a document. In contrast, because idf only
|
||||
# considers the presence of a token in a document, it is insensitive
|
||||
# to the frequency of the token within the document.
|
||||
{
|
||||
"testcase_name":
|
||||
"retained_tokens_sensitive_to_within_document_frequency",
|
||||
"data":
|
||||
np.array([[42, 42], [1138, 1138], [1729, 1729], [1138, 1138],
|
||||
[725, 203]]),
|
||||
"vocab_size":
|
||||
3,
|
||||
"expected_accumulator_output": {
|
||||
"vocab": np.array([1138, 42, 1729, 725, 203]),
|
||||
"counts": np.array([4, 2, 2, 1, 1]),
|
||||
},
|
||||
"expected_extract_output": {
|
||||
"vocab": np.array([1138, 1729, 42]),
|
||||
},
|
||||
})
|
||||
def test_combiner_computation(self, data, vocab_size,
|
||||
expected_accumulator_output,
|
||||
expected_extract_output):
|
||||
combiner = index_lookup._IndexLookupCombiner(vocab_size=vocab_size)
|
||||
expected_accumulator = combiner._create_accumulator()
|
||||
expected_accumulator = self.update_accumulator(expected_accumulator,
|
||||
expected_accumulator_output)
|
||||
self.validate_accumulator_computation(combiner, data, expected_accumulator)
|
||||
self.validate_accumulator_extract(combiner, data, expected_extract_output)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test.main()
|
||||
|
112
tensorflow/python/keras/layers/preprocessing/integer_lookup.py
Normal file
112
tensorflow/python/keras/layers/preprocessing/integer_lookup.py
Normal file
@ -0,0 +1,112 @@
|
||||
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""Keras string lookup preprocessing layer."""
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from tensorflow.python.framework import dtypes
|
||||
from tensorflow.python.keras.layers.preprocessing import index_lookup
|
||||
from tensorflow.python.keras.layers.preprocessing import table_utils
|
||||
|
||||
|
||||
class IntegerLookup(index_lookup.IndexLookup):
|
||||
"""Maps integers from a vocabulary to integer indices.
|
||||
|
||||
This layer translates a set of arbitrary integers into an integer output via a
|
||||
table-based lookup, with optional out-of-vocabulary handling.
|
||||
|
||||
If desired, the user can call this layer's `adapt()` method on a data set,
|
||||
  which will analyze the data set, determine the frequency of individual integer
|
||||
values, and create a vocabulary from them. This vocabulary can have
|
||||
unlimited size or be capped, depending on the configuration options for this
|
||||
layer; if there are more unique values in the input than the maximum
|
||||
vocabulary size, the most frequent terms will be used to create the
|
||||
vocabulary.
|
||||
|
||||
Attributes:
|
||||
max_values: The maximum size of the vocabulary for this layer. If None,
|
||||
there is no cap on the size of the vocabulary. Note that this vocabulary
|
||||
      includes the OOV and mask values, so the effective number of values is
|
||||
      (max_values - num_oov_indices - (1 if mask_value else 0)).
|
||||
num_oov_indices: The number of out-of-vocabulary values to use; defaults to
|
||||
1. If this value is more than 1, OOV inputs are hashed to determine their
|
||||
OOV value; if this value is 0, passing an OOV input will result in a '-1'
|
||||
being returned for that value in the output tensor. (Note that, because
|
||||
the value is -1 and not 0, this will allow you to effectively drop OOV
|
||||
values from categorical encodings.)
|
||||
mask_value: A value that represents masked inputs, and which is mapped to
|
||||
      index 0. Defaults to 0. If set to None, no mask value will be added and the
|
||||
      OOV values, if any, will be indexed from (0...num_oov_indices) instead of
|
||||
      (1...num_oov_indices+1).
|
||||
oov_value: The value representing an out-of-vocabulary value. Defaults to
|
||||
-1.
|
||||
vocabulary: An optional list of values, or a path to a text file containing
|
||||
a vocabulary to load into this layer. The file should contain one value
|
||||
      per line. If the list or file contains the same value multiple times, an
|
||||
error will be thrown.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
max_values=None,
|
||||
num_oov_indices=1,
|
||||
mask_value=0,
|
||||
oov_value=-1,
|
||||
vocabulary=None,
|
||||
**kwargs):
|
||||
allowed_dtypes = [dtypes.int64]
|
||||
|
||||
if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes:
|
||||
raise ValueError("IntegerLookup may only have a dtype in %s." %
|
||||
allowed_dtypes)
|
||||
|
||||
if "dtype" not in kwargs:
|
||||
kwargs["dtype"] = dtypes.int64
|
||||
|
||||
    # If max_values is set, it must be greater than 1; otherwise we
|
||||
# are creating a 0-element vocab, which doesn't make sense.
|
||||
if max_values is not None and max_values <= 1:
|
||||
raise ValueError("If set, max_values must be greater than 1.")
|
||||
|
||||
if num_oov_indices < 0:
|
||||
raise ValueError("num_oov_indices must be greater than 0. You passed %s" %
|
||||
num_oov_indices)
|
||||
|
||||
if vocabulary is not None:
|
||||
if isinstance(vocabulary, str):
|
||||
vocabulary = table_utils.get_vocabulary_from_file(vocabulary)
|
||||
vocabulary = [int(v) for v in vocabulary]
|
||||
|
||||
super(IntegerLookup, self).__init__(
|
||||
max_tokens=max_values,
|
||||
num_oov_indices=num_oov_indices,
|
||||
mask_token=mask_value,
|
||||
oov_token=oov_value,
|
||||
vocabulary=vocabulary,
|
||||
**kwargs)
|
||||
|
||||
def get_config(self):
|
||||
base_config = super(IntegerLookup, self).get_config()
|
||||
    # The base config stores these arguments under the IndexLookup names, so
    # rename the keys to this layer's argument names and drop the originals.
|
||||
base_config["max_values"] = base_config["max_tokens"]
|
||||
del base_config["max_tokens"]
|
||||
|
||||
base_config["mask_value"] = base_config["mask_token"]
|
||||
del base_config["mask_token"]
|
||||
|
||||
base_config["oov_value"] = base_config["oov_token"]
|
||||
del base_config["oov_token"]
|
||||
return base_config
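
A minimal usage sketch for the layer above, assuming the TF2 eager code path; the sample values and the tie order in the printed vocabulary are illustrative only:

import numpy as np
from tensorflow.python.keras.layers.preprocessing import integer_lookup

# Learn a vocabulary from example data; values are indexed by frequency after
# the reserved mask (0) and OOV (-1) entries.
data = np.array([[42, 1138], [1729, 1138]], dtype=np.int64)
layer = integer_lookup.IntegerLookup(
    max_values=None, num_oov_indices=1, mask_value=0, oov_value=-1)
layer.adapt(data)
print(layer.get_vocabulary())  # e.g. [0, -1, 1138, 42, 1729]
# A value that was never adapted (7 here) falls into the OOV index 1.
print(layer(np.array([[1138, 7]], dtype=np.int64)))  # [[2 1]]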
|
@ -0,0 +1,501 @@
|
||||
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""Tests for Keras text vectorization preprocessing layer."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import itertools
|
||||
import os
|
||||
import random
|
||||
|
||||
from absl.testing import parameterized
|
||||
import numpy as np
|
||||
|
||||
from tensorflow.python import keras
|
||||
from tensorflow.python import tf2
|
||||
|
||||
from tensorflow.python.data.ops import dataset_ops
|
||||
from tensorflow.python.eager import context
|
||||
from tensorflow.python.framework import dtypes
|
||||
from tensorflow.python.framework import sparse_tensor
|
||||
from tensorflow.python.framework import tensor_shape
|
||||
from tensorflow.python.keras import keras_parameterized
|
||||
from tensorflow.python.keras import testing_utils
|
||||
from tensorflow.python.keras.layers.preprocessing import integer_lookup
|
||||
from tensorflow.python.keras.layers.preprocessing import integer_lookup_v1
|
||||
from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils
|
||||
from tensorflow.python.keras.saving import save
|
||||
from tensorflow.python.keras.utils.generic_utils import CustomObjectScope
|
||||
from tensorflow.python.ops.ragged import ragged_factory_ops
|
||||
from tensorflow.python.platform import gfile
|
||||
from tensorflow.python.platform import test
|
||||
|
||||
|
||||
def get_layer_class():
|
||||
if context.executing_eagerly():
|
||||
return integer_lookup.IntegerLookup
|
||||
else:
|
||||
return integer_lookup_v1.IntegerLookup
|
||||
|
||||
|
||||
def _get_end_to_end_test_cases():
|
||||
test_cases = (
|
||||
{
|
||||
"testcase_name":
|
||||
"test_ints_soft_vocab_cap",
|
||||
# Create an array where 1138 is the most frequent term, followed by
|
||||
# 1729, then 725, then 42. This ensures that the vocab accumulator
|
||||
# is sorting by frequency.
|
||||
"vocab_data":
|
||||
np.array([[42], [1138], [1138], [1138], [1138], [1729], [1729],
|
||||
[1729], [725], [725]],
|
||||
dtype=np.int64),
|
||||
"input_data":
|
||||
np.array([[1138], [1729], [725], [42], [42], [725], [1138], [4]],
|
||||
dtype=np.int64),
|
||||
"kwargs": {
|
||||
"max_values": None,
|
||||
"dtype": dtypes.int64,
|
||||
},
|
||||
"expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]],
|
||||
"input_dtype":
|
||||
dtypes.int64
|
||||
},)
|
||||
|
||||
crossed_test_cases = []
|
||||
# Cross above test cases with use_dataset in (True, False)
|
||||
for use_dataset in (True, False):
|
||||
for case in test_cases:
|
||||
case = case.copy()
|
||||
if use_dataset:
|
||||
case["testcase_name"] = case["testcase_name"] + "_with_dataset"
|
||||
case["use_dataset"] = use_dataset
|
||||
crossed_test_cases.append(case)
|
||||
|
||||
return crossed_test_cases
|
||||
|
||||
|
||||
@keras_parameterized.run_all_keras_modes
|
||||
class IntegerLookupLayerTest(keras_parameterized.TestCase,
|
||||
preprocessing_test_utils.PreprocessingLayerTest):
|
||||
|
||||
@parameterized.named_parameters(*_get_end_to_end_test_cases())
|
||||
def test_layer_end_to_end_with_adapt(self, vocab_data, input_data, kwargs,
|
||||
use_dataset, expected_output,
|
||||
input_dtype):
|
||||
cls = get_layer_class()
|
||||
expected_output_dtype = dtypes.int64
|
||||
input_shape = input_data.shape
|
||||
|
||||
if use_dataset:
|
||||
# Keras APIs expect batched datasets.
|
||||
# TODO(rachelim): `model.predict` predicts the result on each
|
||||
# dataset batch separately, then tries to concatenate the results
|
||||
# together. When the results have different shapes on the non-concat
|
||||
# axis (which can happen in the output_mode = INT case for
|
||||
# IntegerLookup), the concatenation fails. In real use cases, this may
|
||||
# not be an issue because users are likely to pipe the preprocessing layer
|
||||
# into other keras layers instead of predicting it directly. A workaround
|
||||
# for these unit tests is to have the dataset only contain one batch, so
|
||||
# no concatenation needs to happen with the result. For consistency with
|
||||
# numpy input, we should make `predict` join differently shaped results
|
||||
# together sensibly, with 0 padding.
|
||||
input_data = dataset_ops.Dataset.from_tensor_slices(input_data).batch(
|
||||
input_shape[0])
|
||||
vocab_data = dataset_ops.Dataset.from_tensor_slices(vocab_data).batch(
|
||||
input_shape[0])
|
||||
|
||||
with CustomObjectScope({"IntegerLookup": cls}):
|
||||
output_data = testing_utils.layer_test(
|
||||
cls,
|
||||
kwargs=kwargs,
|
||||
input_shape=input_shape,
|
||||
input_data=input_data,
|
||||
input_dtype=input_dtype,
|
||||
expected_output_dtype=expected_output_dtype,
|
||||
validate_training=False,
|
||||
adapt_data=vocab_data)
|
||||
self.assertAllClose(expected_output, output_data)
|
||||
|
||||
|
||||
@keras_parameterized.run_all_keras_modes
|
||||
class CategoricalEncodingInputTest(
|
||||
keras_parameterized.TestCase,
|
||||
preprocessing_test_utils.PreprocessingLayerTest):
|
||||
|
||||
def test_sparse_int_input(self):
|
||||
vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
|
||||
input_array = sparse_tensor.SparseTensor(
|
||||
indices=[[0, 0], [1, 2]],
|
||||
values=np.array([13, 32], dtype=np.int64),
|
||||
dense_shape=[3, 4])
|
||||
|
||||
expected_indices = [[0, 0], [1, 2]]
|
||||
expected_values = [5, 1]
|
||||
expected_dense_shape = [3, 4]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
|
||||
layer = get_layer_class()(max_values=None)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
output_data = model.predict(input_array, steps=1)
|
||||
self.assertAllEqual(expected_indices, output_data.indices)
|
||||
self.assertAllEqual(expected_values, output_data.values)
|
||||
self.assertAllEqual(expected_dense_shape, output_data.dense_shape)
|
||||
|
||||
def test_ragged_int_input(self):
|
||||
vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
|
||||
input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 42]],
|
||||
dtype=np.int64)
|
||||
expected_output = [[2, 3, 5], [5, 4, 2, 1]]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True)
|
||||
layer = get_layer_class()(max_values=None)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
output_dataset = model.predict(input_array)
|
||||
self.assertAllEqual(expected_output, output_dataset)
|
||||
|
||||
|
||||
@keras_parameterized.run_all_keras_modes
|
||||
class CategoricalEncodingMultiOOVTest(
|
||||
keras_parameterized.TestCase,
|
||||
preprocessing_test_utils.PreprocessingLayerTest):
|
||||
|
||||
def test_sparse_int_input_multi_bucket(self):
|
||||
vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
|
||||
input_array = sparse_tensor.SparseTensor(
|
||||
indices=[[0, 0], [1, 2]],
|
||||
values=np.array([13, 133], dtype=np.int64),
|
||||
dense_shape=[3, 4])
|
||||
|
||||
expected_indices = [[0, 0], [1, 2]]
|
||||
expected_values = [6, 2]
|
||||
expected_dense_shape = [3, 4]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
|
||||
layer = get_layer_class()(
|
||||
max_values=None,
|
||||
dtype=dtypes.int64,
|
||||
num_oov_indices=2,
|
||||
mask_value=0,
|
||||
oov_value=-1)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
output_data = model.predict(input_array, steps=1)
|
||||
self.assertAllEqual(expected_indices, output_data.indices)
|
||||
self.assertAllEqual(expected_values, output_data.values)
|
||||
self.assertAllEqual(expected_dense_shape, output_data.dense_shape)
|
||||
|
||||
def test_ragged_int_input_multi_bucket(self):
|
||||
vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
|
||||
input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 133]],
|
||||
dtype=np.int64)
|
||||
expected_output = [[3, 4, 6], [6, 5, 3, 2]]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True)
|
||||
layer = get_layer_class()(max_values=None, num_oov_indices=2)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
output_dataset = model.predict(input_array)
|
||||
self.assertAllEqual(expected_output, output_dataset)
|
||||
|
||||
|
||||
@keras_parameterized.run_all_keras_modes
|
||||
class CategoricalEncodingAdaptTest(
|
||||
keras_parameterized.TestCase,
|
||||
preprocessing_test_utils.PreprocessingLayerTest):
|
||||
|
||||
def test_sparse_adapt(self):
|
||||
vocab_data = sparse_tensor.SparseTensor(
|
||||
indices=[[0, 0], [0, 1], [1, 2]],
|
||||
values=[203, 1729, 203],
|
||||
dense_shape=[3, 4])
|
||||
vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data)
|
||||
|
||||
layer = get_layer_class()()
|
||||
layer.adapt(vocab_dataset)
|
||||
expected_vocabulary = [0, -1, 203, 1729]
|
||||
self.assertAllEqual(expected_vocabulary, layer.get_vocabulary())
|
||||
|
||||
def test_ragged_adapt(self):
|
||||
vocab_data = ragged_factory_ops.constant([[203], [1729, 203]])
|
||||
vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data)
|
||||
|
||||
layer = get_layer_class()()
|
||||
layer.adapt(vocab_dataset)
|
||||
expected_vocabulary = [0, -1, 203, 1729]
|
||||
self.assertAllEqual(expected_vocabulary, layer.get_vocabulary())
|
||||
|
||||
def test_sparse_int_input(self):
|
||||
vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
|
||||
input_array = sparse_tensor.SparseTensor(
|
||||
indices=[[0, 0], [1, 2]],
|
||||
values=np.array([13, 32], dtype=np.int64),
|
||||
dense_shape=[3, 4])
|
||||
|
||||
expected_indices = [[0, 0], [1, 2]]
|
||||
expected_values = [5, 1]
|
||||
expected_dense_shape = [3, 4]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
|
||||
layer = get_layer_class()(max_values=None)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
output_data = model.predict(input_array, steps=1)
|
||||
self.assertAllEqual(expected_indices, output_data.indices)
|
||||
self.assertAllEqual(expected_values, output_data.values)
|
||||
self.assertAllEqual(expected_dense_shape, output_data.dense_shape)
|
||||
|
||||
def test_ragged_int_input(self):
|
||||
vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
|
||||
input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 42]],
|
||||
dtype=np.int64)
|
||||
expected_output = [[2, 3, 5], [5, 4, 2, 1]]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True)
|
||||
layer = get_layer_class()(max_values=None)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
output_dataset = model.predict(input_array)
|
||||
self.assertAllEqual(expected_output, output_dataset)
|
||||
|
||||
def test_single_int_generator_dataset(self):
|
||||
|
||||
def word_gen():
|
||||
for _ in itertools.count(1):
|
||||
yield random.randint(0, 100)
|
||||
|
||||
ds = dataset_ops.Dataset.from_generator(word_gen, dtypes.int64,
|
||||
tensor_shape.TensorShape([]))
|
||||
batched_ds = ds.take(2)
|
||||
input_t = keras.Input(shape=(), dtype=dtypes.int64)
|
||||
layer = get_layer_class()(
|
||||
max_values=10, num_oov_indices=0, mask_value=None, oov_value=None)
|
||||
_ = layer(input_t)
|
||||
layer.adapt(batched_ds)
|
||||
|
||||
|
||||
@keras_parameterized.run_all_keras_modes
|
||||
class IntegerLookupOutputTest(keras_parameterized.TestCase,
|
||||
preprocessing_test_utils.PreprocessingLayerTest):
|
||||
|
||||
def test_int_output(self):
|
||||
vocab_data = [42, 1138, 725, 1729]
|
||||
input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
|
||||
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
|
||||
layer = get_layer_class()()
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
output_dataset = model.predict(input_array)
|
||||
self.assertAllEqual(expected_output, output_dataset)
|
||||
|
||||
def test_output_shape(self):
|
||||
input_data = keras.Input(shape=(4,), dtype=dtypes.int64)
|
||||
layer = get_layer_class()(max_values=None, num_oov_indices=1)
|
||||
int_data = layer(input_data)
|
||||
self.assertAllEqual(int_data.shape[1:], input_data.shape[1:])
|
||||
|
||||
def test_int_output_no_reserved_zero(self):
|
||||
vocab_data = [42, 1138, 725, 1729]
|
||||
input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
|
||||
expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
|
||||
layer = get_layer_class()(max_values=None, mask_value=None)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
output_dataset = model.predict(input_array)
|
||||
self.assertAllEqual(expected_output, output_dataset)
|
||||
|
||||
def test_int_output_explicit_vocab(self):
|
||||
vocab_data = [42, 1138, 725, 1729]
|
||||
input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
|
||||
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
|
||||
layer = get_layer_class()(
|
||||
vocabulary=vocab_data,
|
||||
max_values=None,
|
||||
)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
output_dataset = model.predict(input_array)
|
||||
self.assertAllEqual(expected_output, output_dataset)
|
||||
|
||||
|
||||
@keras_parameterized.run_all_keras_modes
|
||||
class IntegerLookupVocabularyTest(
|
||||
keras_parameterized.TestCase,
|
||||
preprocessing_test_utils.PreprocessingLayerTest):
|
||||
|
||||
def _write_to_temp_file(self, file_name, vocab_list):
|
||||
vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt")
|
||||
with gfile.GFile(vocab_path, "w") as writer:
|
||||
for vocab in vocab_list:
|
||||
writer.write(str(vocab) + "\n")
|
||||
writer.flush()
|
||||
writer.close()
|
||||
return vocab_path
|
||||
|
||||
def test_int_output_explicit_vocab(self):
|
||||
vocab_data = [42, 1138, 725, 1729]
|
||||
input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
|
||||
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
|
||||
layer = get_layer_class()(vocabulary=vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
output_dataset = model.predict(input_array)
|
||||
self.assertAllEqual(expected_output, output_dataset)
|
||||
|
||||
def test_get_vocab_returns_int(self):
|
||||
vocab_data = [42, 1138, 725, 1729]
|
||||
expected_vocab = [0, -1, 42, 1138, 725, 1729]
|
||||
layer = get_layer_class()(vocabulary=vocab_data)
|
||||
layer_vocab = layer.get_vocabulary()
|
||||
self.assertAllEqual(expected_vocab, layer_vocab)
|
||||
self.assertIsInstance(layer_vocab[0], np.int64)
|
||||
|
||||
def test_int_output_explicit_vocab_from_file(self):
|
||||
vocab_list = [42, 1138, 725, 1729]
|
||||
vocab_path = self._write_to_temp_file("vocab_file", vocab_list)
|
||||
|
||||
input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
|
||||
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
|
||||
layer = get_layer_class()(vocabulary=vocab_path)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
output_dataset = model.predict(input_array)
|
||||
self.assertAllEqual(expected_output, output_dataset)
|
||||
|
||||
def test_non_unique_vocab_fails(self):
|
||||
vocab_data = [42, 1138, 725, 1729, 1729]
|
||||
with self.assertRaisesRegex(ValueError, ".*repeated term.*1729.*"):
|
||||
_ = get_layer_class()(vocabulary=vocab_data)
|
||||
|
||||
def test_non_unique_vocab_from_file_fails(self):
|
||||
vocab_list = [42, 1138, 725, 1729, 42]
|
||||
vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list)
|
||||
with self.assertRaisesRegex(ValueError, ".*repeated term.*42.*"):
|
||||
_ = get_layer_class()(vocabulary=vocab_path)
|
||||
|
||||
|
||||
@keras_parameterized.run_all_keras_modes(always_skip_eager=True)
|
||||
class IntegerLookupSaveableTest(keras_parameterized.TestCase,
|
||||
preprocessing_test_utils.PreprocessingLayerTest
|
||||
):
|
||||
|
||||
def test_ops_are_not_added_with_multiple_get_set_weights(self):
|
||||
vocab_data = [42, 1138, 725, 1729]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
|
||||
layer = get_layer_class()(max_values=10)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
weights = model.get_weights()
|
||||
model.set_weights(weights)
|
||||
keras.backend.get_session().graph.finalize()
|
||||
weights = model.get_weights()
|
||||
model.set_weights(weights)
|
||||
|
||||
def test_layer_saving_with_h5(self):
|
||||
vocab_data = [42, 1138, 725, 1729]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
|
||||
layer = get_layer_class()(max_values=10)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
path = os.path.join(self.get_temp_dir(), "model")
|
||||
with self.assertRaisesRegex(NotImplementedError,
|
||||
"Save or restore weights that is not.*"):
|
||||
save.save_model(model, path, save_format="h5")
|
||||
|
||||
|
||||
@keras_parameterized.run_all_keras_modes
|
||||
class IntegerLookupErrorTest(keras_parameterized.TestCase,
|
||||
preprocessing_test_utils.PreprocessingLayerTest):
|
||||
|
||||
def test_too_long_vocab_fails_in_single_setting(self):
|
||||
vocab_data = [42, 1138, 725, 1729]
|
||||
|
||||
layer = get_layer_class()(max_values=4, num_oov_indices=1)
|
||||
with self.assertRaisesRegex(ValueError,
|
||||
"vocabulary larger than the maximum vocab.*"):
|
||||
layer.set_vocabulary(vocab_data)
|
||||
|
||||
def test_zero_max_values_fails(self):
|
||||
with self.assertRaisesRegex(ValueError, ".*max_values.*"):
|
||||
_ = get_layer_class()(max_values=0, num_oov_indices=1)
|
||||
|
||||
|
||||
@keras_parameterized.run_all_keras_modes
|
||||
class IntegerLookupSavingTest(keras_parameterized.TestCase,
|
||||
preprocessing_test_utils.PreprocessingLayerTest):
|
||||
|
||||
def test_vocabulary_persistence_across_saving(self):
|
||||
vocab_data = [42, 1138, 725, 1729]
|
||||
input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
|
||||
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
|
||||
|
||||
# Build and validate a golden model.
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
|
||||
layer = get_layer_class()(max_values=None, num_oov_indices=1)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
output_dataset = model.predict(input_array)
|
||||
self.assertAllEqual(output_dataset, expected_output)
|
||||
|
||||
# Save the model to disk.
|
||||
output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model")
|
||||
model.save(output_path, save_format="tf")
|
||||
|
||||
# Delete the session and graph to ensure that the loaded model is generated
|
||||
# from scratch.
|
||||
# TODO(b/149526183): Can't clear session when TF2 is disabled.
|
||||
if tf2.enabled():
|
||||
keras.backend.clear_session()
|
||||
|
||||
loaded_model = keras.models.load_model(
|
||||
output_path, custom_objects={"IntegerLookup": get_layer_class()})
|
||||
|
||||
# Ensure that the loaded model is unique (so that the save/load is real)
|
||||
self.assertIsNot(model, loaded_model)
|
||||
|
||||
# Validate correctness of the new model.
|
||||
new_output_dataset = loaded_model.predict(input_array)
|
||||
self.assertAllEqual(new_output_dataset, expected_output)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test.main()
|
@ -0,0 +1,25 @@
|
||||
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""Keras string lookup preprocessing layer."""
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from tensorflow.python.keras.layers.preprocessing import index_lookup_v1
|
||||
from tensorflow.python.keras.layers.preprocessing import integer_lookup
|
||||
|
||||
|
||||
class IntegerLookup(integer_lookup.IntegerLookup, index_lookup_v1.IndexLookup):
|
||||
pass
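
The v1 shim above only mixes the shared implementation into the v1 IndexLookup base. Code that must run under both graph and eager execution can pick the class the same way the tests do; a small sketch of that selection (the helper name is illustrative):

from tensorflow.python.eager import context
from tensorflow.python.keras.layers.preprocessing import integer_lookup
from tensorflow.python.keras.layers.preprocessing import integer_lookup_v1

def pick_integer_lookup_class():
  # Mirrors get_layer_class() in integer_lookup_test.py: the v2 class when
  # executing eagerly, otherwise the v1 variant.
  if context.executing_eagerly():
    return integer_lookup.IntegerLookup
  return integer_lookup_v1.IntegerLookup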
|
106
tensorflow/python/keras/layers/preprocessing/string_lookup.py
Normal file
@ -0,0 +1,106 @@
|
||||
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""Keras string lookup preprocessing layer."""
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from tensorflow.python.framework import dtypes
|
||||
from tensorflow.python.keras.layers.preprocessing import index_lookup
|
||||
from tensorflow.python.keras.layers.preprocessing import table_utils
|
||||
|
||||
|
||||
class StringLookup(index_lookup.IndexLookup):
|
||||
"""Maps strings from a vocabulary to integer indices.
|
||||
|
||||
This layer translates a set of arbitrary strings into an integer output via a
|
||||
table-based lookup, with optional out-of-vocabulary handling.
|
||||
|
||||
If desired, the user can call this layer's `adapt()` method on a data set,
|
||||
which will analyze the data set, determine the frequency of individual string
|
||||
values, and create a vocabulary from them. This vocabulary can have
|
||||
unlimited size or be capped, depending on the configuration options for this
|
||||
layer; if there are more unique values in the input than the maximum
|
||||
vocabulary size, the most frequent terms will be used to create the
|
||||
vocabulary.
|
||||
|
||||
Attributes:
|
||||
max_tokens: The maximum size of the vocabulary for this layer. If None,
|
||||
there is no cap on the size of the vocabulary. Note that this vocabulary
|
||||
includes the OOV and mask tokens, so the effective number of tokens is
|
||||
(max_tokens - num_oov_indices - (1 if mask_token else 0))
|
||||
num_oov_indices: The number of out-of-vocabulary tokens to use; defaults to
|
||||
1. If this value is more than 1, OOV inputs are hashed to determine their
|
||||
OOV value; if this value is 0, passing an OOV input will result in a '-1'
|
||||
being returned for that value in the output tensor. (Note that, because
|
||||
the value is -1 and not 0, this will allow you to effectively drop OOV
|
||||
values from categorical encodings.)
|
||||
mask_token: A token that represents masked values, and which is mapped to
|
||||
index 0. Defaults to the empty string "". If set to None, no mask term
|
||||
will be added and the OOV tokens, if any, will be indexed from
|
||||
(0...num_oov_indices) instead of (1...num_oov_indices+1).
|
||||
oov_token: The token representing an out-of-vocabulary value. Defaults to
|
||||
"[OOV]".
|
||||
vocabulary: An optional list of vocabulary terms, or a path to a text file
|
||||
containing a vocabulary to load into this layer. The file should contain
|
||||
one token per line. If the list or file contains the same token multiple
|
||||
times, an error will be thrown.
|
||||
encoding: The Python string encoding to use. Defaults to `'utf-8'`.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
max_tokens=None,
|
||||
num_oov_indices=1,
|
||||
mask_token="",
|
||||
oov_token="[OOV]",
|
||||
vocabulary=None,
|
||||
encoding="utf-8",
|
||||
**kwargs):
|
||||
allowed_dtypes = [dtypes.string]
|
||||
|
||||
if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes:
|
||||
raise ValueError("StringLookup may only have a dtype in %s." %
|
||||
allowed_dtypes)
|
||||
|
||||
if "dtype" not in kwargs:
|
||||
kwargs["dtype"] = dtypes.string
|
||||
|
||||
if vocabulary is not None:
|
||||
if isinstance(vocabulary, str):
|
||||
vocabulary = table_utils.get_vocabulary_from_file(vocabulary, encoding)
|
||||
|
||||
self.encoding = encoding
|
||||
|
||||
super(StringLookup, self).__init__(
|
||||
max_tokens=max_tokens,
|
||||
num_oov_indices=num_oov_indices,
|
||||
mask_token=mask_token,
|
||||
oov_token=oov_token,
|
||||
vocabulary=vocabulary,
|
||||
**kwargs)
|
||||
|
||||
def get_config(self):
|
||||
config = {"encoding": self.encoding}
|
||||
base_config = super(StringLookup, self).get_config()
|
||||
return dict(list(base_config.items()) + list(config.items()))
|
||||
|
||||
def get_vocabulary(self):
|
||||
if self._table_handler.vocab_size() == 0:
|
||||
return []
|
||||
|
||||
keys, values = self._table_handler.data()
|
||||
# This is required because the MutableHashTable doesn't preserve insertion
|
||||
# order, but we rely on the order of the array to assign indices.
|
||||
return [x.decode(self.encoding) for _, x in sorted(zip(values, keys))]
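
A parallel sketch for the string variant, assuming an explicit vocabulary rather than adapt(); the index assignments follow the expectations exercised in string_lookup_test.py below:

import numpy as np
from tensorflow.python.keras.layers.preprocessing import string_lookup

layer = string_lookup.StringLookup(vocabulary=["earth", "wind", "and", "fire"])
# Index 0 is the mask token "" and index 1 is "[OOV]", so known tokens start at 2.
print(layer.get_vocabulary())  # ['', '[OOV]', 'earth', 'wind', 'and', 'fire']
# An unseen token such as "michigan" maps to the OOV index 1.
print(layer(np.array([["earth", "wind", "michigan"]])))  # [[2 3 1]]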
|
@ -0,0 +1,224 @@
|
||||
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""Tests for Keras text vectorization preprocessing layer."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
|
||||
from absl.testing import parameterized
|
||||
import numpy as np
|
||||
import six
|
||||
|
||||
from tensorflow.python import keras
|
||||
|
||||
from tensorflow.python.data.ops import dataset_ops
|
||||
from tensorflow.python.eager import context
|
||||
from tensorflow.python.framework import dtypes
|
||||
from tensorflow.python.keras import keras_parameterized
|
||||
from tensorflow.python.keras import testing_utils
|
||||
from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils
|
||||
from tensorflow.python.keras.layers.preprocessing import string_lookup
|
||||
from tensorflow.python.keras.layers.preprocessing import string_lookup_v1
|
||||
from tensorflow.python.keras.saving import save
|
||||
from tensorflow.python.keras.utils.generic_utils import CustomObjectScope
|
||||
from tensorflow.python.platform import gfile
|
||||
from tensorflow.python.platform import test
|
||||
|
||||
|
||||
def get_layer_class():
|
||||
if context.executing_eagerly():
|
||||
return string_lookup.StringLookup
|
||||
else:
|
||||
return string_lookup_v1.StringLookup
|
||||
|
||||
|
||||
def _get_end_to_end_test_cases():
|
||||
test_cases = (
|
||||
{
|
||||
"testcase_name":
|
||||
"test_strings_soft_vocab_cap",
|
||||
# Create an array where 'earth' is the most frequent term, followed by
|
||||
# 'wind', then 'and', then 'fire'. This ensures that the vocab
|
||||
# accumulator is sorting by frequency.
|
||||
"vocab_data":
|
||||
np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"],
|
||||
["wind"], ["wind"], ["wind"], ["and"], ["and"]]),
|
||||
"input_data":
|
||||
np.array([["earth"], ["wind"], ["and"], ["fire"], ["fire"],
|
||||
["and"], ["earth"], ["michigan"]]),
|
||||
"kwargs": {
|
||||
"max_tokens": None,
|
||||
},
|
||||
"expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]],
|
||||
"input_dtype":
|
||||
dtypes.string
|
||||
},
|
||||
)
|
||||
|
||||
crossed_test_cases = []
|
||||
# Cross above test cases with use_dataset in (True, False)
|
||||
for use_dataset in (True, False):
|
||||
for case in test_cases:
|
||||
case = case.copy()
|
||||
if use_dataset:
|
||||
case["testcase_name"] = case["testcase_name"] + "_with_dataset"
|
||||
case["use_dataset"] = use_dataset
|
||||
crossed_test_cases.append(case)
|
||||
|
||||
return crossed_test_cases
|
||||
|
||||
|
||||
@keras_parameterized.run_all_keras_modes
|
||||
class StringLookupLayerTest(keras_parameterized.TestCase,
|
||||
preprocessing_test_utils.PreprocessingLayerTest):
|
||||
|
||||
@parameterized.named_parameters(*_get_end_to_end_test_cases())
|
||||
def test_layer_end_to_end_with_adapt(self, vocab_data, input_data, kwargs,
|
||||
use_dataset, expected_output,
|
||||
input_dtype):
|
||||
cls = get_layer_class()
|
||||
expected_output_dtype = dtypes.int64
|
||||
input_shape = input_data.shape
|
||||
|
||||
if use_dataset:
|
||||
# Keras APIs expect batched datasets.
|
||||
# TODO(rachelim): `model.predict` predicts the result on each
|
||||
# dataset batch separately, then tries to concatenate the results
|
||||
# together. When the results have different shapes on the non-concat
|
||||
# axis (which can happen in the output_mode = INT case for
|
||||
# StringLookup), the concatenation fails. In real use cases, this may
|
||||
# not be an issue because users are likely to pipe the preprocessing layer
|
||||
# into other keras layers instead of predicting it directly. A workaround
|
||||
# for these unit tests is to have the dataset only contain one batch, so
|
||||
# no concatenation needs to happen with the result. For consistency with
|
||||
# numpy input, we should make `predict` join differently shaped results
|
||||
# together sensibly, with 0 padding.
|
||||
input_data = dataset_ops.Dataset.from_tensor_slices(input_data).batch(
|
||||
input_shape[0])
|
||||
vocab_data = dataset_ops.Dataset.from_tensor_slices(vocab_data).batch(
|
||||
input_shape[0])
|
||||
|
||||
with CustomObjectScope({"StringLookup": cls}):
|
||||
output_data = testing_utils.layer_test(
|
||||
cls,
|
||||
kwargs=kwargs,
|
||||
input_shape=input_shape,
|
||||
input_data=input_data,
|
||||
input_dtype=input_dtype,
|
||||
expected_output_dtype=expected_output_dtype,
|
||||
validate_training=False,
|
||||
adapt_data=vocab_data)
|
||||
self.assertAllClose(expected_output, output_data)
|
||||
|
||||
|
||||
@keras_parameterized.run_all_keras_modes
|
||||
class StringLookupVocabularyTest(keras_parameterized.TestCase,
|
||||
preprocessing_test_utils.PreprocessingLayerTest
|
||||
):
|
||||
|
||||
def _write_to_temp_file(self, file_name, vocab_list):
|
||||
vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt")
|
||||
with gfile.GFile(vocab_path, "w") as writer:
|
||||
for vocab in vocab_list:
|
||||
writer.write(vocab + "\n")
|
||||
writer.flush()
|
||||
writer.close()
|
||||
return vocab_path
|
||||
|
||||
def test_int_output_explicit_vocab(self):
|
||||
vocab_data = ["earth", "wind", "and", "fire"]
|
||||
input_array = np.array([["earth", "wind", "and", "fire"],
|
||||
["fire", "and", "earth", "michigan"]])
|
||||
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
|
||||
layer = get_layer_class()(vocabulary=vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
output_dataset = model.predict(input_array)
|
||||
self.assertAllEqual(expected_output, output_dataset)
|
||||
|
||||
def test_get_vocab_returns_str(self):
|
||||
vocab_data = ["earth", "wind", "and", "fire"]
|
||||
expected_vocab = ["", "[OOV]", "earth", "wind", "and", "fire"]
|
||||
layer = get_layer_class()(vocabulary=vocab_data)
|
||||
layer_vocab = layer.get_vocabulary()
|
||||
self.assertAllEqual(expected_vocab, layer_vocab)
|
||||
self.assertIsInstance(layer_vocab[0], six.text_type)
|
||||
|
||||
def test_int_output_explicit_vocab_from_file(self):
|
||||
vocab_list = ["earth", "wind", "and", "fire"]
|
||||
vocab_path = self._write_to_temp_file("vocab_file", vocab_list)
|
||||
|
||||
input_array = np.array([["earth", "wind", "and", "fire"],
|
||||
["fire", "and", "earth", "michigan"]])
|
||||
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
|
||||
layer = get_layer_class()(vocabulary=vocab_path)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
output_dataset = model.predict(input_array)
|
||||
self.assertAllEqual(expected_output, output_dataset)
|
||||
|
||||
def test_non_unique_vocab_fails(self):
|
||||
vocab_data = ["earth", "wind", "and", "fire", "fire"]
|
||||
with self.assertRaisesRegex(ValueError, ".*repeated term.*fire.*"):
|
||||
_ = get_layer_class()(vocabulary=vocab_data)
|
||||
|
||||
def test_non_unique_vocab_from_file_fails(self):
|
||||
vocab_list = ["earth", "wind", "and", "fire", "earth"]
|
||||
vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list)
|
||||
with self.assertRaisesRegex(ValueError, ".*repeated term.*earth.*"):
|
||||
_ = get_layer_class()(vocabulary=vocab_path)
|
||||
|
||||
|
||||
@keras_parameterized.run_all_keras_modes(always_skip_eager=True)
|
||||
class StringLookupSaveableTest(keras_parameterized.TestCase,
|
||||
preprocessing_test_utils.PreprocessingLayerTest):
|
||||
|
||||
def test_ops_are_not_added_with_multiple_get_set_weights(self):
|
||||
vocab_data = ["earth", "wind", "and", "fire"]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
|
||||
layer = get_layer_class()(max_tokens=10)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
weights = model.get_weights()
|
||||
model.set_weights(weights)
|
||||
keras.backend.get_session().graph.finalize()
|
||||
weights = model.get_weights()
|
||||
model.set_weights(weights)
|
||||
|
||||
def test_layer_saving_with_h5(self):
|
||||
vocab_data = ["earth", "wind", "and", "fire"]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
|
||||
layer = get_layer_class()(max_tokens=10)
|
||||
layer.set_vocabulary(vocab_data)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
path = os.path.join(self.get_temp_dir(), "model")
|
||||
with self.assertRaisesRegex(NotImplementedError,
|
||||
"Save or restore weights that is not.*"):
|
||||
save.save_model(model, path, save_format="h5")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test.main()
|
@ -0,0 +1,25 @@
|
||||
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""Keras string lookup preprocessing layer."""
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from tensorflow.python.keras.layers.preprocessing import index_lookup_v1
|
||||
from tensorflow.python.keras.layers.preprocessing import string_lookup
|
||||
|
||||
|
||||
class StringLookup(string_lookup.StringLookup, index_lookup_v1.IndexLookup):
|
||||
pass
|
@ -189,4 +189,3 @@ def convert_to_ndarray(x, dtype=None):
|
||||
if np.can_cast(array.dtype, np_dtype):
|
||||
array = array.astype(np_dtype, casting="safe")
|
||||
return array
|
||||
|
||||
|
@ -32,7 +32,7 @@ from tensorflow.python.keras import backend as K
|
||||
from tensorflow.python.keras.engine.base_preprocessing_layer import Combiner
|
||||
from tensorflow.python.keras.engine.base_preprocessing_layer import CombinerPreprocessingLayer
|
||||
from tensorflow.python.keras.layers.preprocessing import categorical_encoding
|
||||
from tensorflow.python.keras.layers.preprocessing import index_lookup
|
||||
from tensorflow.python.keras.layers.preprocessing import string_lookup
|
||||
from tensorflow.python.keras.utils import layer_utils
|
||||
from tensorflow.python.ops import array_ops
|
||||
from tensorflow.python.ops import control_flow_ops
|
||||
@ -269,10 +269,6 @@ class TextVectorization(CombinerPreprocessingLayer):
|
||||
|
||||
self._max_tokens = max_tokens
|
||||
|
||||
# In INT mode, we have two reserved values (PAD and OOV). However, non-INT
|
||||
# modes don't have a PAD value, so we only need to reserve one value.
|
||||
self._reserved_values = 2 if output_mode == INT else 1
|
||||
|
||||
# In INT mode, the zero value is reserved for padding (per Keras standard
|
||||
# padding approaches). In non-INT modes, there is no padding so we can set
|
||||
# the OOV value to zero instead of one.
|
||||
@ -303,9 +299,9 @@ class TextVectorization(CombinerPreprocessingLayer):
|
||||
self._max_vocab_size, compute_idf=output_mode == TFIDF),
|
||||
**kwargs)
|
||||
|
||||
reserve_zero = output_mode in [None, INT]
|
||||
mask_token = "" if output_mode in [None, INT] else None
|
||||
self._index_lookup_layer = self._get_index_lookup_class()(
|
||||
max_tokens=max_tokens, reserve_zero=reserve_zero, dtype=dtypes.string)
|
||||
max_tokens=max_tokens, mask_token=mask_token)
|
||||
|
||||
# If this layer is configured for string or integer output, we do not
|
||||
# create a vectorization layer (as the output is not vectorized).
|
||||
@ -328,7 +324,7 @@ class TextVectorization(CombinerPreprocessingLayer):
|
||||
return (keys.numpy(), values.numpy())
|
||||
|
||||
def _get_index_lookup_class(self):
|
||||
return index_lookup.IndexLookup
|
||||
return string_lookup.StringLookup
|
||||
|
||||
def _to_numpy(self, preprocessed_data):
|
||||
"""Converts preprocessed inputs into numpy arrays."""
|
||||
@ -428,26 +424,21 @@ class TextVectorization(CombinerPreprocessingLayer):
|
||||
def set_vocabulary(self,
|
||||
vocab,
|
||||
df_data=None,
|
||||
oov_df_value=None,
|
||||
append=False):
|
||||
oov_df_value=None):
|
||||
"""Sets vocabulary (and optionally document frequency) data for this layer.
|
||||
|
||||
This method sets the vocabulary and DF data for this layer directly, instead
|
||||
of analyzing a dataset through 'adapt'. It should be used whenever the vocab
|
||||
(and optionally document frequency) information is already known. If
|
||||
vocabulary data is already present in the layer, this method will either
|
||||
replace it, if 'append' is set to False, or append to it (if 'append' is set
|
||||
to True).
|
||||
vocabulary data is already present in the layer, this method will replace
|
||||
it.
|
||||
|
||||
Arguments:
|
||||
vocab: An array of string tokens.
|
||||
df_data: An array of document frequency data. Only necessary if the layer
|
||||
output_mode is TFIDF.
|
||||
oov_df_value: The document frequency of the OOV token. Only necessary if
|
||||
output_mode is TFIDF. OOV data is optional when appending additional
|
||||
data in TFIDF mode; if an OOV value is supplied it will overwrite the
|
||||
existing OOV value.
|
||||
append: Whether to overwrite or append any existing vocabulary data.
|
||||
output_mode is TFIDF.
|
||||
|
||||
Raises:
|
||||
ValueError: If there are too many inputs, the inputs do not match, or
|
||||
@ -468,8 +459,7 @@ class TextVectorization(CombinerPreprocessingLayer):
|
||||
"be changed after the layer is "
|
||||
"called.").format(mode=self._output_mode))
|
||||
|
||||
current_table_size = self._index_lookup_layer.vocab_size()
|
||||
self._index_lookup_layer.set_vocabulary(vocab, append)
|
||||
self._index_lookup_layer.set_vocabulary(vocab)
|
||||
|
||||
# When doing raw or integer output, we don't have a Vectorize layer to
|
||||
# manage. In this case, we can return directly.
|
||||
@ -477,14 +467,9 @@ class TextVectorization(CombinerPreprocessingLayer):
|
||||
return
|
||||
|
||||
if not self._pad_to_max or self._max_tokens is None:
|
||||
num_tokens = self._index_lookup_layer.vocab_size() + self._reserved_values
|
||||
num_tokens = self._index_lookup_layer.vocab_size()
|
||||
self._vectorize_layer.set_num_elements(num_tokens)
|
||||
|
||||
# We're only _really_ appending if the table_size is nonzero. This is
|
||||
# important for some sanity checks in tfidf mode (specifically, checking if
|
||||
# oov_df_value is set or not) and handling existing tfidf weight data.
|
||||
append = append if current_table_size > 0 else False
|
||||
|
||||
if self._output_mode == TFIDF:
|
||||
if df_data is None:
|
||||
raise ValueError("df_data must be set if output_mode is TFIDF")
|
||||
@ -492,31 +477,14 @@ class TextVectorization(CombinerPreprocessingLayer):
|
||||
raise ValueError("df_data must be the same length as vocab. "
|
||||
"len(df_data) is %s, len(vocab) is %s" %
|
||||
(len(vocab), len(df_data)))
|
||||
if not append and oov_df_value is None:
|
||||
raise ValueError("You must pass an oov_df_value the first time "
|
||||
"'set_vocabulary' is called when output_mode is "
|
||||
if oov_df_value is None:
|
||||
raise ValueError("You must pass an oov_df_value when output_mode is "
|
||||
"TFIDF.")
|
||||
|
||||
df_data = self._convert_to_ndarray(df_data)
|
||||
if append:
|
||||
# The existing IDF data is stored in a Keras weight, so we can get it
|
||||
# by calling K.get_value() on the weight object. Take the first
|
||||
# table_size+1 values in case we're padding the weight with zeros
|
||||
existing_df_data = K.get_value(
|
||||
self._vectorize_layer.tf_idf_weights)[:current_table_size + 1]
|
||||
df_data = np.append(existing_df_data, df_data, axis=0)
|
||||
# If we are appending and need to replace the OOV DF value, we can
|
||||
# assign it over the existing OOV DF value at index 0 of the (already-
|
||||
# concatenated) DF value array.
|
||||
if oov_df_value is not None:
|
||||
df_data[0] = oov_df_value
|
||||
else:
|
||||
# If we are not appending (that is, we have only new data) we need to
|
||||
# insert the OOV value to the front of the array. (This is a append to
|
||||
# the head, not a replacement of the zeroth value.)
|
||||
if not isinstance(oov_df_value, np.ndarray):
|
||||
oov_df_value = np.array([oov_df_value])
|
||||
df_data = np.insert(df_data, 0, oov_df_value)
|
||||
if not isinstance(oov_df_value, np.ndarray):
|
||||
oov_df_value = np.array([oov_df_value])
|
||||
df_data = np.insert(df_data, 0, oov_df_value)
|
||||
self._vectorize_layer.set_tfidf_data(df_data)
|
||||
|
||||
def build(self, input_shape):
|
||||
@ -536,8 +504,10 @@ class TextVectorization(CombinerPreprocessingLayer):
|
||||
if not self.built:
|
||||
raise RuntimeError("_set_state_variables() must be called after build().")
|
||||
if self._output_mode == TFIDF:
|
||||
self.set_vocabulary(updates[_VOCAB_NAME], updates[_IDF_NAME],
|
||||
updates[_OOV_IDF_NAME])
|
||||
self.set_vocabulary(
|
||||
updates[_VOCAB_NAME],
|
||||
updates[_IDF_NAME],
|
||||
updates[_OOV_IDF_NAME])
|
||||
else:
|
||||
self.set_vocabulary(updates[_VOCAB_NAME])
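
With the append path removed, a TFIDF-mode vocabulary is now supplied in a single call; a sketch of the updated set_vocabulary convention, mirroring the arguments used in the tests below (the sample tokens and frequencies are illustrative):

from tensorflow.python.keras.layers.preprocessing import text_vectorization

layer = text_vectorization.TextVectorization(
    max_tokens=5, standardize=None, split=None,
    output_mode=text_vectorization.TFIDF)
# Vocab, per-token document-frequency data, and the OOV document frequency are
# passed together; calling set_vocabulary again replaces the previous state.
layer.set_vocabulary(["earth", "wind", "and", "fire"],
                     df_data=[.5, .25, .2, .125],
                     oov_df_value=.05)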
|
||||
|
||||
|
@ -619,25 +619,6 @@ class TextVectorizationOutputTest(
|
||||
output_dataset = model.predict(input_array)
|
||||
self.assertAllEqual(expected_output, output_dataset)
|
||||
|
||||
def test_vocab_appending(self):
|
||||
vocab_data = [["earth", "wind"], ["and", "fire"]]
|
||||
input_array = np.array([["earth", "wind", "and", "fire"],
|
||||
["fire", "and", "earth", "michigan"]])
|
||||
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
|
||||
layer = get_layer_class()(
|
||||
max_tokens=5,
|
||||
standardize=None,
|
||||
split=None,
|
||||
output_mode=text_vectorization.INT)
|
||||
layer.set_vocabulary(vocab_data[0])
|
||||
layer.set_vocabulary(vocab_data[1], append=True)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
output_dataset = model.predict(input_array)
|
||||
self.assertAllClose(expected_output, output_dataset)
|
||||
|
||||
def test_int_output_densifies_with_zeros(self):
|
||||
vocab_data = ["earth", "wind", "and", "fire"]
|
||||
# Create an input array that has 5 elements in the first example and 4 in
|
||||
@ -1046,7 +1027,10 @@ class TextVectorizationOutputTest(
|
||||
split=None,
|
||||
output_mode=text_vectorization.TFIDF,
|
||||
pad_to_max_tokens=True)
|
||||
layer.set_vocabulary(vocab_data, df_data=tfidf_data, oov_df_value=.05)
|
||||
layer.set_vocabulary(
|
||||
vocab_data,
|
||||
df_data=tfidf_data,
|
||||
oov_df_value=.05)
|
||||
int_data = layer(input_data)
|
||||
self.assertAllEqual(expected_output_shape, int_data.shape.as_list())
|
||||
|
||||
@ -1084,60 +1068,6 @@ class TextVectorizationOutputTest(
|
||||
output_dataset = model.predict(input_array)
|
||||
self.assertAllClose(expected_output, output_dataset)
|
||||
|
||||
def test_tfidf_appending(self):
|
||||
vocab_data = [["earth", "wind"], ["and", "fire"]]
|
||||
tfidf_data = [[.5, .25], [.2, .125]]
|
||||
input_array = np.array([["earth", "wind", "and", "earth"],
|
||||
["ohio", "fire", "earth", "michigan"]])
|
||||
|
||||
# pyformat: disable
|
||||
# pylint: disable=bad-whitespace
|
||||
expected_output = [[ 0, 1, .25, .2, 0],
|
||||
[.1, .5, 0, 0, .125]]
|
||||
# pylint: enable=bad-whitespace
|
||||
# pyformat: enable
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
|
||||
layer = get_layer_class()(
|
||||
max_tokens=5,
|
||||
standardize=None,
|
||||
split=None,
|
||||
output_mode=text_vectorization.TFIDF)
|
||||
layer.set_vocabulary(vocab_data[0], df_data=tfidf_data[0], oov_df_value=.05)
|
||||
layer.set_vocabulary(vocab_data[1], df_data=tfidf_data[1], append=True)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
output_dataset = model.predict(input_array)
|
||||
self.assertAllClose(expected_output, output_dataset)
|
||||
|
||||
def test_tfidf_appending_with_oov_replacement(self):
|
||||
vocab_data = [["earth", "wind"], ["and", "fire"]]
|
||||
tfidf_data = [[.5, .25], [.2, .125]]
|
||||
input_array = np.array([["earth", "wind", "and", "earth"],
|
||||
["ohio", "fire", "earth", "michigan"]])
|
||||
|
||||
# pyformat: disable
|
||||
# pylint: disable=bad-whitespace
|
||||
expected_output = [[ 0, 1, .25, .2, 0],
|
||||
[1.5, .5, 0, 0, .125]]
|
||||
# pylint: enable=bad-whitespace
|
||||
# pyformat: enable
|
||||
|
||||
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
|
||||
layer = get_layer_class()(
|
||||
max_tokens=5,
|
||||
standardize=None,
|
||||
split=None,
|
||||
output_mode=text_vectorization.TFIDF)
|
||||
layer.set_vocabulary(vocab_data[0], df_data=tfidf_data[0], oov_df_value=.05)
|
||||
    # Note that here we've replaced the OOV value.
|
||||
layer.set_vocabulary(
|
||||
vocab_data[1], df_data=tfidf_data[1], oov_df_value=.75, append=True)
|
||||
int_data = layer(input_data)
|
||||
model = keras.Model(inputs=input_data, outputs=int_data)
|
||||
output_dataset = model.predict(input_array)
|
||||
self.assertAllClose(expected_output, output_dataset)
|
||||
|
||||
def test_accept_1D_input(self):
|
||||
input_array = np.array(["earth wind and fire",
|
||||
"fire and earth michigan"])
|
||||
@ -1274,22 +1204,6 @@ class TextVectorizationErrorTest(keras_parameterized.TestCase,
|
||||
"vocabulary larger than the maximum vocab.*"):
|
||||
layer.set_vocabulary(vocab_data)
|
||||
|
||||
def test_too_long_vocab_fails_in_multiple_settings(self):
|
||||
vocab_data = [["earth", "wind"], ["and", "fire"]]
|
||||
|
||||
layer = get_layer_class()(
|
||||
max_tokens=4,
|
||||
standardize=None,
|
||||
split=None,
|
||||
output_mode=text_vectorization.INT)
|
||||
|
||||
# The first time we call set_vocabulary, we're under the max_tokens limit
|
||||
# so it should be fine.
|
||||
layer.set_vocabulary(vocab_data[0])
|
||||
with self.assertRaisesRegex(ValueError,
|
||||
"vocabulary larger than the maximum vocab.*"):
|
||||
layer.set_vocabulary(vocab_data[1], append=True)
|
||||
|
||||
def test_setting_vocab_without_tfidf_data_fails_in_tfidf_mode(self):
|
||||
vocab_data = ["earth", "wind", "and", "fire"]
|
||||
|
||||
@ -1326,18 +1240,6 @@ class TextVectorizationErrorTest(keras_parameterized.TestCase,
|
||||
"You must pass an oov_df_value.*"):
|
||||
layer.set_vocabulary(vocab_data, df_data)
|
||||
|
||||
def test_tfidf_set_vocab_with_no_oov_fails_with_append_set(self):
|
||||
vocab_data = ["earth", "wind", "and", "fire"]
|
||||
df_data = [1, 2, 3, 4]
|
||||
layer = get_layer_class()(
|
||||
max_tokens=5,
|
||||
standardize=None,
|
||||
split=None,
|
||||
output_mode=text_vectorization.TFIDF)
|
||||
with self.assertRaisesRegex(ValueError,
|
||||
"You must pass an oov_df_value.*"):
|
||||
layer.set_vocabulary(vocab_data, df_data, append=True)
|
||||
|
||||
def test_set_tfidf_in_non_tfidf_fails(self):
|
||||
vocab_data = ["earth", "wind", "and", "fire"]
|
||||
df_data = [1, 2, 3, 4]
|
||||
|
@ -23,7 +23,7 @@ import numpy as np
|
||||
from tensorflow.python.keras import backend as K
|
||||
from tensorflow.python.keras.engine import base_preprocessing_layer_v1
|
||||
from tensorflow.python.keras.layers.preprocessing import categorical_encoding_v1
|
||||
from tensorflow.python.keras.layers.preprocessing import index_lookup_v1
|
||||
from tensorflow.python.keras.layers.preprocessing import string_lookup_v1
|
||||
from tensorflow.python.keras.layers.preprocessing import text_vectorization
|
||||
from tensorflow.python.ops.ragged import ragged_tensor_value
|
||||
from tensorflow.python.util.tf_export import keras_export
|
||||
@ -84,7 +84,7 @@ class TextVectorization(text_vectorization.TextVectorization,
|
||||
return categorical_encoding_v1.CategoricalEncoding
|
||||
|
||||
def _get_index_lookup_class(self):
|
||||
return index_lookup_v1.IndexLookup
|
||||
return string_lookup_v1.StringLookup
|
||||
|
||||
def _to_numpy(self, data):
|
||||
"""Converts preprocessed inputs into numpy arrays."""
|
||||
|
@ -221,7 +221,7 @@ tf_class {
|
||||
}
|
||||
member_method {
|
||||
name: "set_vocabulary"
|
||||
argspec: "args=[\'self\', \'vocab\', \'df_data\', \'oov_df_value\', \'append\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], "
|
||||
argspec: "args=[\'self\', \'vocab\', \'df_data\', \'oov_df_value\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
|
||||
}
|
||||
member_method {
|
||||
name: "set_weights"
|
||||
|
@ -219,7 +219,7 @@ tf_class {
|
||||
}
|
||||
member_method {
|
||||
name: "set_vocabulary"
|
||||
argspec: "args=[\'self\', \'vocab\', \'df_data\', \'oov_df_value\', \'append\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], "
|
||||
argspec: "args=[\'self\', \'vocab\', \'df_data\', \'oov_df_value\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
|
||||
}
|
||||
member_method {
|
||||
name: "set_weights"
|
||||
|