diff --git a/tensorflow/python/keras/layers/preprocessing/BUILD b/tensorflow/python/keras/layers/preprocessing/BUILD index bef294429bd..c1e1d5573e5 100644 --- a/tensorflow/python/keras/layers/preprocessing/BUILD +++ b/tensorflow/python/keras/layers/preprocessing/BUILD @@ -27,10 +27,12 @@ py_library( ":discretization", ":hashing", ":image_preprocessing", + ":integer_lookup", ":normalization", ":preprocessing_stage", ":preprocessing_test_utils", ":reduction", + ":string_lookup", ":text_vectorization", ], ) @@ -146,6 +148,20 @@ py_library( ], ) +py_library( + name = "integer_lookup", + srcs = [ + "integer_lookup.py", + "integer_lookup_v1.py", + ], + srcs_version = "PY2AND3", + deps = [ + ":index_lookup", + ":table_utils", + "//tensorflow/python:dtypes", + ], +) + py_library( name = "table_utils", srcs = [ @@ -179,7 +195,7 @@ py_library( srcs_version = "PY2AND3", deps = [ ":categorical_encoding", - ":index_lookup", + ":string_lookup", "//tensorflow/python:array_ops", "//tensorflow/python:control_flow_ops", "//tensorflow/python:dtypes", @@ -235,6 +251,20 @@ py_library( ], ) +py_library( + name = "string_lookup", + srcs = [ + "string_lookup.py", + "string_lookup_v1.py", + ], + srcs_version = "PY2AND3", + deps = [ + ":index_lookup", + ":table_utils", + "//tensorflow/python:dtypes", + ], +) + py_library( name = "preprocessing_stage", srcs = [ @@ -442,6 +472,22 @@ tf_py_test( ], ) +tf_py_test( + name = "integer_lookup_test", + size = "medium", + srcs = ["integer_lookup_test.py"], + python_version = "PY3", + deps = [ + ":integer_lookup", + ":preprocessing_test_utils", + "//tensorflow/python:client_testlib", + "//tensorflow/python/keras", + "//tensorflow/python/keras/utils:generic_utils", + "//tensorflow/python/ops/ragged:ragged_string_ops", + "@absl_py//absl/testing:parameterized", + ], +) + distribute_py_test( name = "normalization_distribution_test", srcs = ["normalization_distribution_test.py"], @@ -517,6 +563,22 @@ tf_py_test( ], ) +tf_py_test( + name = "string_lookup_test", + size = "medium", + srcs = ["string_lookup_test.py"], + python_version = "PY3", + deps = [ + ":preprocessing_test_utils", + ":string_lookup", + "//tensorflow/python:client_testlib", + "//tensorflow/python/keras", + "//tensorflow/python/keras/utils:generic_utils", + "//tensorflow/python/ops/ragged:ragged_string_ops", + "@absl_py//absl/testing:parameterized", + ], +) + tf_py_test( name = "preprocessing_stage_test", srcs = ["preprocessing_stage_test.py"], diff --git a/tensorflow/python/keras/layers/preprocessing/index_lookup.py b/tensorflow/python/keras/layers/preprocessing/index_lookup.py index d6c8a07c8ba..ba9b0d740e1 100644 --- a/tensorflow/python/keras/layers/preprocessing/index_lookup.py +++ b/tensorflow/python/keras/layers/preprocessing/index_lookup.py @@ -41,14 +41,16 @@ _ACCUMULATOR_COUNTS_NAME = "counts" class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): - """Maps strings (or integers) from a vocabulary to integer indices. + """Maps values from a vocabulary to integer indices. - This layer translates a set of arbitrary strings or integers into an integer - output via a table-based lookup, with optional out-of-vocabulary handling. + This layer translates a set of arbitrary hashables into an integer output via + a table-based lookup, with optional out-of-vocabulary handling. This is the + basis layer for both IntegerLookup and IndexLookup; it holds the common + logic but is not intended to be exported as part of the Keras API. 
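For orientation, here is a minimal usage sketch of the new explicit-argument `IndexLookup` signature, adapted from the updated unit tests later in this diff. The internal import path and the direct use of `IndexLookup` are assumptions for illustration only; the layer is normally reached through the `StringLookup` and `IntegerLookup` wrappers added in this change.

```python
import numpy as np

from tensorflow.python import keras
from tensorflow.python.framework import dtypes
from tensorflow.python.keras.layers.preprocessing import index_lookup

# The mask token is reserved at index 0 and the single OOV index at 1, so the
# four vocabulary terms occupy indices 2..5.
layer = index_lookup.IndexLookup(
    max_tokens=None,
    num_oov_indices=1,
    mask_token="",
    oov_token="[OOV]",
    dtype=dtypes.string)
layer.set_vocabulary(["earth", "wind", "and", "fire"])

inputs = keras.Input(shape=(None,), dtype=dtypes.string)
model = keras.Model(inputs, layer(inputs))

# "michigan" is out of vocabulary and should map to the OOV index 1, giving
# [[2, 3, 4, 5], [5, 4, 2, 1]] per the accompanying tests.
print(model.predict(np.array([["earth", "wind", "and", "fire"],
                              ["fire", "and", "earth", "michigan"]])))
```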
If desired, the user can call this layer's `adapt()` method on a data set, which will analyze the data set, determine the frequency of individual string - or integer values, and create a vocabulary from them. This vocabulary can have + values, and create a vocabulary from them. This vocabulary can have unlimited size or be capped, depending on the configuration options for this layer; if there are more unique values in the input than the maximum vocabulary size, the most frequent terms will be used to create the @@ -56,84 +58,47 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): Attributes: max_tokens: The maximum size of the vocabulary for this layer. If None, - there is no cap on the size of the vocabulary. Note that the vocabulary - does include OOV buckets, so the effective number of unique values in the - vocabulary is `(max_tokens - num_oov_tokens)` when this value is set. - num_oov_tokens: The number of out-of-vocabulary tokens to use; defaults to - 1. If this value is more than 1, OOV inputs are hashed to determine their - OOV value; if this value is 0, passing an OOV input will result in a '-1' - being returned for that value in the output tensor. (Note that, because - the value is -1 and not 0, this will allow you to effectively drop OOV - values from categorical encodings.) - vocabulary: An optional list of vocabulary terms, or a path to a text file - containing a vocabulary to load into this layer. The file should contain - one token per line. In either case, the vocabulary must be unique; if - the list or file contains the same token multiple times, an error will - be thrown. Note that when passing a vocabulary - either as a list or as - a file - the vocabulary will not be present in the layer's config dict; - it will instead be a part of the layer's weights. - reserve_zero: Whether to reserve the index 0, which indicates pad values in - the Keras masking system. If True, the output of this layer will be in the - range `[1...max_tokens+1)`; if False, the output will be in the range - `[0...max_tokens)`. Defaults to True. - mask_zero: If True, input values of 0 (for integers) and `""` (for strings) - will be treated as masked values and assigned an output value of 0. If - this option is set, `reserve_zero` must also be set. Defaults to False. - Call arguments: - inputs: The data to look up. Can be a tf.Tensor or RaggedTensor. - invert: Controls the lookup direction. If False, the layer will map strings - to integers; if true, the layer will map integers to strings. Defaults - to False. + there is no cap on the size of the vocabulary. Note that this vocabulary + includes the OOV and mask tokens, so the effective number of tokens is + (max_tokens - num_oov_indices - (1 if mask_token else 0)) + num_oov_indices: The number of out-of-vocabulary tokens to use. If this + value is more than 1, OOV inputs are hashed to determine their OOV value; + if this value is 0, passing an OOV input will result in a '-1' being + returned for that value in the output tensor. (Note that, because the + value is -1 and not 0, this will allow you to effectively drop OOV values + from categorical encodings.) + mask_token: A token that represents masked values, and which is mapped to + index 0. If set to None, no mask term will be added and the OOV tokens, if + any, will be indexed from (0...num_oov_indices) instead of + (1...num_oov_indices+1). + oov_token: The token representing an out-of-vocabulary value. This token is + only used when performing an inverse lookup. 
+ vocabulary: An optional list of vocabulary terms. If the list contains the + same token multiple times, an error will be thrown. """ # TODO(momernick): Add an examples section to the docstring. def __init__(self, - max_tokens=None, - num_oov_tokens=1, + max_tokens, + num_oov_indices, + mask_token, + oov_token, vocabulary=None, - reserve_zero=True, - mask_zero=False, **kwargs): - invert = False - if invert: - allowed_dtypes = [dtypes.int32, dtypes.int64] - else: - allowed_dtypes = [dtypes.string, dtypes.int32, dtypes.int64] - - if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes: - raise ValueError("TextVectorization may only have a dtype in %s." % - allowed_dtypes) - - if "dtype" not in kwargs: - kwargs["dtype"] = dtypes.int64 if invert else dtypes.string # If max_tokens is set, the value must be greater than 1 - otherwise we # are creating a 0-element vocab, which doesn't make sense. if max_tokens is not None and max_tokens <= 1: raise ValueError("If set, max_tokens must be greater than 1.") - if num_oov_tokens < 0: - raise ValueError("num_oov_tokens must be greater than 0. You passed %s" % - num_oov_tokens) + if num_oov_indices < 0: + raise ValueError("num_oov_indices must be greater than 0. You passed %s" % + num_oov_indices) - self.invert = invert self.max_tokens = max_tokens - self.num_oov_tokens = num_oov_tokens - self.reserve_zero = reserve_zero - self.mask_zero = mask_zero - - # We need to reserve at least num_oov_tokens tokens, plus one additional - # value if we are reserving the zero value in our output. - if reserve_zero: - self._reserved_values = (num_oov_tokens + 1) - else: - self._reserved_values = num_oov_tokens - - # We need to account for the OOV buckets in our vocabulary size. - if max_tokens is not None: - self._max_elements = max_tokens - num_oov_tokens - else: - self._max_elements = None + self.num_oov_indices = num_oov_indices + self.oov_token = oov_token + self.mask_token = mask_token # If there is only one OOV bucket, we can determine the OOV value (either 0 # or 1 depending on whether 0 is reserved) and set that as the default @@ -141,20 +106,17 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): # do a further hashing step; to make this easier, we set the OOV value to # -1. (This lets us do a vectorized add and cast to boolean to determine # locations where we need to do extra hashing.) - if self.num_oov_tokens == 1: - self._oov_value = 1 if reserve_zero else 0 + if self.num_oov_indices == 1: + self._oov_value = 0 if mask_token is None else 1 else: self._oov_value = -1 super(IndexLookup, self).__init__( - combiner=_IndexLookupCombiner(self.max_tokens), **kwargs) + combiner=_IndexLookupCombiner(self.max_tokens, self.mask_token), + **kwargs) + + self._output_dtype = dtypes.int64 - # If the layer's input type is int32, we can only output int32 values - - # MutableHashTable doesn't allow us to map int32->int64. - if self.dtype == dtypes.int32: - self._output_dtype = dtypes.int32 - else: - self._output_dtype = dtypes.int64 self._table = lookup_ops.MutableHashTable( key_dtype=self.dtype, value_dtype=self._output_dtype, @@ -167,33 +129,27 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): # counting code in the Model object doesn't throw an attribute error. 
tracked_table.shape = tensor_shape.TensorShape((0,)) - if self.num_oov_tokens <= 1: - oov_tokens = None + if self.num_oov_indices <= 1: + oov_indices = None else: - oov_start = 1 if reserve_zero else 0 - oov_tokens = list(range(oov_start, self._reserved_values)) + oov_start = 1 if mask_token is not None else 0 + oov_end = oov_start + num_oov_indices + oov_indices = list(range(oov_start, oov_end)) self._table_handler = table_utils.TableHandler( table=self._table, - oov_tokens=oov_tokens, + oov_tokens=oov_indices, use_v1_apis=self._use_v1_apis()) if vocabulary is not None: - if isinstance(vocabulary, str): - vocabulary = table_utils.get_vocabulary_from_file(vocabulary) - table_utils.validate_vocabulary_is_unique(vocabulary) - self.set_vocabulary(vocabulary) def compute_output_shape(self, input_shape): return input_shape - def compute_output_signature(self, input_spec, invert=False): + def compute_output_signature(self, input_spec): output_shape = self.compute_output_shape(input_spec.shape.as_list()) - if invert: - output_dtype = dtypes.string - else: - output_dtype = dtypes.int64 + output_dtype = dtypes.int64 return tensor_spec.TensorSpec(shape=output_shape, dtype=output_dtype) def adapt(self, data, reset_state=True): @@ -220,10 +176,7 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): keys, values = self._table_handler.data() # This is required because the MutableHashTable doesn't preserve insertion # order, but we rely on the order of the array to assign indices. - if self.dtype == dtypes.string: - return [x.decode("utf-8") for _, x in sorted(zip(values, keys))] - else: - return [x for _, x in sorted(zip(values, keys))] + return [x for _, x in sorted(zip(values, keys))] def vocab_size(self): return self._table_handler.vocab_size() @@ -231,10 +184,9 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): def get_config(self): config = { "max_tokens": self.max_tokens, - "num_oov_tokens": self.num_oov_tokens, - "vocabulary": None, - "reserve_zero": self.reserve_zero, - "mask_zero": self.mask_zero, + "num_oov_indices": self.num_oov_indices, + "oov_token": self.oov_token, + "mask_token": self.mask_token, } base_config = super(IndexLookup, self).get_config() return dict(list(base_config.items()) + list(config.items())) @@ -246,46 +198,101 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): # abstraction for ease of saving!) we return 0. return 0 - def set_vocabulary(self, - vocab, - append=False): + def set_vocabulary(self, vocab): """Sets vocabulary (and optionally document frequency) data for this layer. This method sets the vocabulary for this layer directly, instead of analyzing a dataset through 'adapt'. It should be used whenever the vocab information is already known. If vocabulary data is already present in the - layer, this method will either replace it, if 'append' is set to False, or - append to it (if 'append' is set to True). + layer, this method will either replace it Arguments: vocab: An array of string tokens. - append: Whether to overwrite or append any existing vocabulary data. Raises: ValueError: If there are too many inputs, the inputs do not match, or input data is missing. 
""" - current_table_size = self._table_handler.vocab_size() - total_vocab_size = len(vocab) + (current_table_size if append else 0) - if self.max_tokens is not None and total_vocab_size > self._max_elements: + + table_utils.validate_vocabulary_is_unique(vocab) + + should_have_mask = self.mask_token is not None + if should_have_mask: + has_mask = vocab[0] == self.mask_token + oov_start = 1 + else: + has_mask = False + oov_start = 0 + + should_have_oov = self.num_oov_indices > 0 + if should_have_oov: + oov_end = oov_start + self.num_oov_indices + expected_oov = [self.oov_token] * self.num_oov_indices + has_oov = vocab[oov_start:oov_end] == expected_oov + # If we get a numpy array, then has_oov may end up being a numpy array + # instead of a bool. Fix this by collapsing the variable if it's not bool. + if not isinstance(has_oov, bool): + has_oov = any(has_oov) + else: + has_oov = False + + if all([should_have_mask, has_mask, should_have_oov]) and not has_oov: + raise ValueError("The passed vocabulary has the correct mask token `%s` " + "at index 0, but does not have the OOV token `%s` in " + "indices [%s:%s]. Instead, we found `%s`. Was this " + "vocabulary generated by a layer with incompatible " + "settings?" % + (self.mask_token, self.oov_token, oov_start, oov_end, + vocab[oov_start:oov_end])) + + if all([should_have_oov, has_oov, should_have_mask]) and not has_mask: + raise ValueError( + "The passed vocabulary has the correct OOV token `%s` at " + "indices [%s:%s], but does not have the mask token `%s` in " + "index 0. Instead, we found `%s`. Was this vocabulary " + "generated by a layer with incompatible settings?" % + (self.oov_token, oov_start, oov_end, self.mask_token, vocab[0])) + + insert_special_tokens = not has_oov and not has_mask + + special_tokens = [] if self.mask_token is None else [self.mask_token] + special_tokens.extend([self.oov_token] * self.num_oov_indices) + + num_special_tokens = len(special_tokens) + tokens = vocab if insert_special_tokens else vocab[num_special_tokens:] + if self.mask_token in tokens: + raise ValueError("Reserved mask token %s was found in the passed " + "vocabulary at index %s. Please either remove the " + "reserved token from the vocabulary or change the " + "mask token for this layer." % + (self.mask_token, tokens.index(self.mask_token))) + if self.oov_token in tokens: + raise ValueError("Reserved OOV token %s was found in the passed " + "vocabulary at index %s. Please either remove the " + "reserved token from the vocabulary or change the " + "OOV token for this layer." % + (self.oov_token, tokens.index(self.oov_token))) + + if insert_special_tokens: + total_vocab_size = len(vocab) + num_special_tokens + else: + total_vocab_size = len(vocab) + if self.max_tokens is not None and total_vocab_size > self.max_tokens: raise ValueError( "Attempted to set a vocabulary larger than the maximum vocab size. " - "Passed vocab size is %s, max vocab size is %s. Note that the OOV " - "token(s) are automatically added to the number of tokens." % + "Passed vocab size is %s, max vocab size is %s." 
% (total_vocab_size, self.max_tokens)) - start_index = self._reserved_values + (current_table_size if append else 0) + start_index = num_special_tokens values = np.arange(start_index, len(vocab) + start_index, dtype=np.int64) - vocab = table_utils.convert_to_ndarray(vocab, self.dtype) - table_utils.assert_same_type(self.dtype, vocab, "vocab") - values = table_utils.convert_to_ndarray(values, self._output_dtype) - table_utils.assert_same_type(self._output_dtype, values, "values") - - if not append and current_table_size > 0: - self._table_handler.clear() + self._table_handler.clear() self._table_handler.insert(vocab, values) + if insert_special_tokens and num_special_tokens > 0: + special_token_values = np.arange(num_special_tokens, dtype=np.int64) + self._table_handler.insert(special_tokens, special_token_values) + def _set_state_variables(self, updates): if not self.built: raise RuntimeError("_set_state_variables() must be called after build().") @@ -316,18 +323,20 @@ class _IndexLookupCombiner(base_preprocessing_layer.Combiner): dataset, all tokens are retained.s """ - def __init__(self, vocab_size=None): + def __init__(self, vocab_size=None, mask_value=None): self._vocab_size = vocab_size + self._mask_value = mask_value def compute(self, values, accumulator=None): """Compute a step in this computation, returning a new accumulator.""" - values = base_preprocessing_layer.convert_to_list(values) + values = base_preprocessing_layer.convert_to_list( + values, sparse_default_value=self._mask_value) if accumulator is None: accumulator = self._create_accumulator() # TODO(momernick): Benchmark improvements to this algorithm. - if isinstance(values, (str, bytes)): + if isinstance(values, (str, bytes, np.int64)): accumulator.count_dict[values] += 1 else: for document in values: @@ -362,6 +371,8 @@ class _IndexLookupCombiner(base_preprocessing_layer.Combiner): "vocab": A list of the retained items in the vocabulary. 
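To make the special-token handling in `set_vocabulary` above concrete, here is a hedged sketch of the two vocabulary forms it accepts after this change, again using the internal `IndexLookup` class directly as the unit tests do.

```python
from tensorflow.python.framework import dtypes
from tensorflow.python.keras.layers.preprocessing import index_lookup

def make_layer():
  return index_lookup.IndexLookup(
      max_tokens=None, num_oov_indices=1, mask_token="", oov_token="[OOV]",
      dtype=dtypes.string)

# Form 1: plain tokens. The mask and OOV tokens are inserted automatically, so
# get_vocabulary() returns the mask token, the OOV token, then the passed
# tokens in index order: "", "[OOV]", "earth", "wind", "and", "fire"
# (possibly as bytes, since the utf-8 decode was removed above).
layer = make_layer()
layer.set_vocabulary(["earth", "wind", "and", "fire"])
print(layer.get_vocabulary())

# Form 2: a vocabulary that already starts with the special tokens (for
# example, one previously returned by get_vocabulary()) is used verbatim.
layer = make_layer()
layer.set_vocabulary(["", "[OOV]", "earth", "wind", "and", "fire"])

# Mixing the two forms fails: a leading mask token without the expected OOV
# token(s), or the reverse, raises a ValueError.
layer = make_layer()
try:
  layer.set_vocabulary(["", "earth", "wind", "and", "fire"])
except ValueError as err:
  print(err)  # "... does not have the OOV token ..."
```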
""" vocab_counts = accumulator.count_dict + if self._mask_value in vocab_counts: + del vocab_counts[self._mask_value] sorted_counts = sorted( vocab_counts.items(), key=operator.itemgetter(1, 0), reverse=True) vocab_data = ( diff --git a/tensorflow/python/keras/layers/preprocessing/index_lookup_distribution_test.py b/tensorflow/python/keras/layers/preprocessing/index_lookup_distribution_test.py index 3360dad6ffe..098e67f5f6b 100644 --- a/tensorflow/python/keras/layers/preprocessing/index_lookup_distribution_test.py +++ b/tensorflow/python/keras/layers/preprocessing/index_lookup_distribution_test.py @@ -65,7 +65,12 @@ class IndexLookupDistributionTest( with distribution.scope(): input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()() + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.adapt(vocab_dataset) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) diff --git a/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py b/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py index 3c5b5757ec2..a95834233b3 100644 --- a/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py +++ b/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py @@ -21,7 +21,6 @@ from __future__ import print_function import itertools import os import random -import six import string from absl.testing import parameterized @@ -31,7 +30,6 @@ from tensorflow.python import keras from tensorflow.python import tf2 from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.distribute import one_device_strategy from tensorflow.python.eager import context from tensorflow.python.framework import dtypes from tensorflow.python.framework import sparse_tensor @@ -44,7 +42,6 @@ from tensorflow.python.keras.layers.preprocessing import preprocessing_test_util from tensorflow.python.keras.saving import save from tensorflow.python.keras.utils.generic_utils import CustomObjectScope from tensorflow.python.ops.ragged import ragged_factory_ops -from tensorflow.python.platform import gfile from tensorflow.python.platform import test @@ -71,6 +68,10 @@ def _get_end_to_end_test_cases(): ["and"], ["earth"], ["michigan"]]), "kwargs": { "max_tokens": None, + "num_oov_indices": 1, + "mask_token": "", + "oov_token": "[OOV]", + "dtype": dtypes.string, }, "expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]], "input_dtype": @@ -91,6 +92,9 @@ def _get_end_to_end_test_cases(): dtype=np.int64), "kwargs": { "max_tokens": None, + "num_oov_indices": 1, + "mask_token": 0, + "oov_token": -1, "dtype": dtypes.int64, }, "expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]], @@ -172,7 +176,12 @@ class CategoricalEncodingInputTest( expected_dense_shape = [3, 4] input_data = keras.Input(shape=(None,), dtype=dtypes.string, sparse=True) - layer = get_layer_class()(max_tokens=None) + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -193,7 +202,12 @@ class CategoricalEncodingInputTest( expected_dense_shape = [3, 4] input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True) - layer = get_layer_class()(max_tokens=None, dtype=dtypes.int64) + layer = get_layer_class()( + max_tokens=None, + dtype=dtypes.int64, + num_oov_indices=1, + 
mask_token=0, + oov_token=-1) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -209,7 +223,12 @@ class CategoricalEncodingInputTest( expected_output = [[2, 3, 5], [5, 4, 2, 1]] input_data = keras.Input(shape=(None,), dtype=dtypes.string, ragged=True) - layer = get_layer_class()(max_tokens=None) + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -223,7 +242,12 @@ class CategoricalEncodingInputTest( expected_output = [[2, 3, 5], [5, 4, 2, 1]] input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True) - layer = get_layer_class()(max_tokens=None, dtype=dtypes.int64) + layer = get_layer_class()( + max_tokens=None, + dtype=dtypes.int64, + num_oov_indices=1, + mask_token=0, + oov_token=-1) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -248,7 +272,12 @@ class CategoricalEncodingMultiOOVTest( expected_dense_shape = [3, 4] input_data = keras.Input(shape=(None,), dtype=dtypes.string, sparse=True) - layer = get_layer_class()(max_tokens=None, num_oov_tokens=2) + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=2, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -270,7 +299,11 @@ class CategoricalEncodingMultiOOVTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True) layer = get_layer_class()( - max_tokens=None, dtype=dtypes.int64, num_oov_tokens=2) + max_tokens=None, + dtype=dtypes.int64, + num_oov_indices=2, + mask_token=0, + oov_token=-1) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -286,7 +319,12 @@ class CategoricalEncodingMultiOOVTest( expected_output = [[3, 4, 6], [6, 5, 3, 2]] input_data = keras.Input(shape=(None,), dtype=dtypes.string, ragged=True) - layer = get_layer_class()(max_tokens=None, num_oov_tokens=2) + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=2, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -301,7 +339,11 @@ class CategoricalEncodingMultiOOVTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True) layer = get_layer_class()( - max_tokens=None, dtype=dtypes.int64, num_oov_tokens=2) + max_tokens=None, + dtype=dtypes.int64, + num_oov_indices=2, + mask_token=0, + oov_token=-1) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -321,13 +363,14 @@ class CategoricalEncodingAdaptTest( dense_shape=[3, 4]) vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data) - layer = get_layer_class()(max_tokens=None) + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.adapt(vocab_dataset) - # Note that the expected vocabulary has a null string (''). This is because - # we assume that sparse tensors are in fact dense tensors with elided - # values, not ragged tensors. Therefore, we assume that any missing data - # is important and give it a spot in our vocab. 
- expected_vocabulary = ["", "michigan", "fire"] + expected_vocabulary = ["", "[OOV]", "michigan", "fire"] self.assertAllEqual(expected_vocabulary, layer.get_vocabulary()) def test_ragged_adapt(self): @@ -335,9 +378,14 @@ class CategoricalEncodingAdaptTest( ["fire", "michigan"]]) vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data) - layer = get_layer_class()(max_tokens=None) + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.adapt(vocab_dataset) - expected_vocabulary = ["michigan", "fire"] + expected_vocabulary = ["", "[OOV]", "michigan", "fire"] self.assertAllEqual(expected_vocabulary, layer.get_vocabulary()) def test_sparse_int_input(self): @@ -352,7 +400,12 @@ class CategoricalEncodingAdaptTest( expected_dense_shape = [3, 4] input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True) - layer = get_layer_class()(max_tokens=None, dtype=dtypes.int64) + layer = get_layer_class()( + max_tokens=None, + dtype=dtypes.int64, + num_oov_indices=1, + mask_token=0, + oov_token=-1) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -368,7 +421,12 @@ class CategoricalEncodingAdaptTest( expected_output = [[2, 3, 5], [5, 4, 2, 1]] input_data = keras.Input(shape=(None,), dtype=dtypes.string, ragged=True) - layer = get_layer_class()(max_tokens=None) + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -382,7 +440,12 @@ class CategoricalEncodingAdaptTest( expected_output = [[2, 3, 5], [5, 4, 2, 1]] input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True) - layer = get_layer_class()(max_tokens=None, dtype=dtypes.int64) + layer = get_layer_class()( + max_tokens=None, + dtype=dtypes.int64, + num_oov_indices=1, + mask_token=0, + oov_token=-1) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -400,34 +463,15 @@ class CategoricalEncodingAdaptTest( batched_ds = ds.take(2) input_t = keras.Input(shape=(), dtype=dtypes.string) layer = get_layer_class()( - max_tokens=10, num_oov_tokens=0, reserve_zero=False) + max_tokens=10, + num_oov_indices=0, + mask_token=None, + oov_token=None, + dtype=dtypes.string) _ = layer(input_t) layer.adapt(batched_ds) -@keras_parameterized.run_all_keras_modes -class IndexLookupDistributionTest( - keras_parameterized.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def test_cpu_distribution(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - strategy = one_device_strategy.OneDeviceStrategy("/cpu:0") - - with strategy.scope(): - input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()() - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - @keras_parameterized.run_all_keras_modes class IndexLookupOutputTest(keras_parameterized.TestCase, preprocessing_test_utils.PreprocessingLayerTest): @@ -439,7 +483,12 @@ class 
IndexLookupOutputTest(keras_parameterized.TestCase, expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()() + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -448,7 +497,12 @@ class IndexLookupOutputTest(keras_parameterized.TestCase, def test_output_shape(self): input_data = keras.Input(shape=(4,), dtype=dtypes.string) - layer = get_layer_class()() + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) int_data = layer(input_data) self.assertAllEqual(int_data.shape[1:], input_data.shape[1:]) @@ -459,7 +513,12 @@ class IndexLookupOutputTest(keras_parameterized.TestCase, expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(reserve_zero=False) + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token=None, + oov_token="[OOV]", + dtype=dtypes.string) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -473,7 +532,13 @@ class IndexLookupOutputTest(keras_parameterized.TestCase, expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(vocabulary=vocab_data) + layer = get_layer_class()( + vocabulary=vocab_data, + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) output_dataset = model.predict(input_array) @@ -485,15 +550,6 @@ class IndexLookupVocabularyTest(keras_parameterized.TestCase, preprocessing_test_utils.PreprocessingLayerTest ): - def _write_to_temp_file(self, file_name, vocab_list): - vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt") - with gfile.GFile(vocab_path, "w") as writer: - for vocab in vocab_list: - writer.write(vocab + "\n") - writer.flush() - writer.close() - return vocab_path - def test_int_output_explicit_vocab(self): vocab_data = ["earth", "wind", "and", "fire"] input_array = np.array([["earth", "wind", "and", "fire"], @@ -501,107 +557,195 @@ class IndexLookupVocabularyTest(keras_parameterized.TestCase, expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(vocabulary=vocab_data) + layer = get_layer_class()( + vocabulary=vocab_data, + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) output_dataset = model.predict(input_array) self.assertAllEqual(expected_output, output_dataset) - def test_get_vocab_returns_str(self): - vocab_data = ["earth", "wind", "and", "fire"] - layer = get_layer_class()(vocabulary=vocab_data) - layer_vocab = layer.get_vocabulary() - self.assertAllEqual(vocab_data, layer_vocab) - self.assertIsInstance(layer_vocab[0], six.text_type) + def test_vocab_with_max_cap(self): + vocab_data = ["", "[OOV]", "wind", "and", "fire"] + layer = get_layer_class()( + max_tokens=5, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) + layer.set_vocabulary(vocab_data) + 
returned_vocab = layer.get_vocabulary() + self.assertAllEqual(vocab_data, returned_vocab) - def test_int_output_explicit_vocab_from_file(self): - vocab_list = ["earth", "wind", "and", "fire"] - vocab_path = self._write_to_temp_file("vocab_file", vocab_list) - - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(vocabulary=vocab_path) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_vocab_appending(self): - vocab_data = [["earth", "wind"], ["and", "fire"]] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(max_tokens=5) - layer.set_vocabulary(vocab_data[0]) - layer.set_vocabulary(vocab_data[1], append=True) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllClose(expected_output, output_dataset) + def test_int_vocab_with_max_cap(self): + vocab_data = [0, -1, 42, 1276, 1138] + layer = get_layer_class()( + max_tokens=5, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + dtype=dtypes.int64) + layer.set_vocabulary(vocab_data) + returned_vocab = layer.get_vocabulary() + self.assertAllEqual(vocab_data, returned_vocab) def test_non_unique_vocab_fails(self): vocab_data = ["earth", "wind", "and", "fire", "fire"] with self.assertRaisesRegex(ValueError, ".*repeated term.*fire.*"): - _ = get_layer_class()(vocabulary=vocab_data) + _ = get_layer_class()( + vocabulary=vocab_data, + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) - def test_non_unique_vocab_from_file_fails(self): - vocab_list = ["earth", "wind", "and", "fire", "earth"] - vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list) + def test_vocab_with_oov_and_wrong_mask_fails(self): + vocab_data = ["custom_mask", "[OOV]", "earth", "wind", "and", "fire"] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) + with self.assertRaisesRegex(ValueError, ".*does not have the mask token.*"): + layer.set_vocabulary(vocab_data) + + def test_vocab_with_oov_and_no_mask_fails(self): + vocab_data = ["[OOV]", "earth", "wind", "and", "fire"] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) + with self.assertRaisesRegex(ValueError, ".*Reserved OOV.*"): + layer.set_vocabulary(vocab_data) + + def test_vocab_with_mask_but_no_oov_fails(self): + vocab_data = ["", "earth", "wind", "and", "fire"] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) + with self.assertRaisesRegex(ValueError, ".*does not have the OOV token.*"): + layer.set_vocabulary(vocab_data) + + def test_vocab_with_repeated_element_fails(self): + vocab_data = ["earth", "earth", "wind", "and", "fire"] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) with self.assertRaisesRegex(ValueError, 
".*repeated term.*earth.*"): - _ = get_layer_class()(vocabulary=vocab_path) + layer.set_vocabulary(vocab_data) + def test_vocab_with_reserved_oov_element_fails(self): + vocab_data = ["earth", "test", "[OOV]", "wind", "and", "fire"] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) + with self.assertRaisesRegex(ValueError, ".*Reserved OOV.*"): + layer.set_vocabulary(vocab_data) -@keras_parameterized.run_all_keras_modes -class InverseLookupOutputTest(keras_parameterized.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): + def test_vocab_with_reserved_mask_element_fails(self): + vocab_data = ["earth", "mask_token", "wind", "and", "fire"] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="mask_token", + oov_token="[OOV]", + dtype=dtypes.string) + with self.assertRaisesRegex(ValueError, ".*Reserved mask.*"): + layer.set_vocabulary(vocab_data) - def DISABLE_test_inverse_output(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_ints = [[2, 3, 4, 5], [5, 4, 2, 1]] - # Note that the token 'michigan' has been replaced by ''. This is because - # 'michigan' is OOV for this layer. - expected_strings = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", ""]]) - input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(max_tokens=None) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - string_data = layer(int_data, invert=True) - model = keras.Model(inputs=input_data, outputs=[int_data, string_data]) - int_outputs, string_outputs = model.predict(input_array) - self.assertAllEqual(expected_ints, int_outputs) - self.assertAllEqual(expected_strings, string_outputs) + def test_non_unique_int_vocab_fails(self): + vocab_data = [12, 13, 14, 15, 15] + with self.assertRaisesRegex(ValueError, ".*repeated term.*15.*"): + _ = get_layer_class()( + vocabulary=vocab_data, + max_tokens=None, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + dtype=dtypes.int64) - def DISABLE_test_inverse_output_serialization(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_ints = [[2, 3, 4, 5], [5, 4, 2, 1]] - # Note that the token 'michigan' has been replaced by ''. This is because - # 'michigan' is OOV for this layer. 
- expected_strings = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", ""]]) + def test_int_vocab_with_oov_and_wrong_mask_fails(self): + vocab_data = [1234, -1, 11, 21, 13, 14] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + dtype=dtypes.int64) + with self.assertRaisesRegex(ValueError, ".*does not have the mask token.*"): + layer.set_vocabulary(vocab_data) - input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(max_tokens=None) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - string_data = layer(int_data, invert=True) - model = keras.Model(inputs=input_data, outputs=[int_data, string_data]) + def test_int_vocab_with_oov_and_no_mask_fails(self): + vocab_data = [-1, 11, 12, 13, 14] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + dtype=dtypes.int64) + with self.assertRaisesRegex(ValueError, ".*Reserved OOV.*"): + layer.set_vocabulary(vocab_data) - with CustomObjectScope({"IndexLookup": get_layer_class()}): - new_model = keras.Model.from_config(model.get_config()) - new_model.set_weights(model.get_weights()) - int_outputs, string_outputs = new_model.predict(input_array) - self.assertAllEqual(expected_ints, int_outputs) - self.assertAllEqual(expected_strings, string_outputs) + def test_int_vocab_with_mask_but_no_oov_fails(self): + vocab_data = [0, 11, 12, 13, 14] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + dtype=dtypes.int64) + with self.assertRaisesRegex(ValueError, ".*does not have the OOV token.*"): + layer.set_vocabulary(vocab_data) + + def test_int_vocab_with_repeated_element_fails(self): + vocab_data = [11, 11, 34, 23, 124] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + dtype=dtypes.int64) + with self.assertRaisesRegex(ValueError, ".*repeated term.*11.*"): + layer.set_vocabulary(vocab_data) + + def test_int_vocab_with_reserved_oov_element_fails(self): + vocab_data = [14, 38, -1, 34, 3, 84] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + dtype=dtypes.int64) + with self.assertRaisesRegex(ValueError, ".*Reserved OOV.*"): + layer.set_vocabulary(vocab_data) + + def test_int_vocab_with_reserved_mask_element_fails(self): + vocab_data = [125, 0, 3, 4, 94] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + dtype=dtypes.int64) + with self.assertRaisesRegex(ValueError, ".*Reserved mask.*"): + layer.set_vocabulary(vocab_data) @keras_parameterized.run_all_keras_modes(always_skip_eager=True) @@ -612,7 +756,12 @@ class IndexLookupSaveableTest(keras_parameterized.TestCase, vocab_data = ["earth", "wind", "and", "fire"] input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(max_tokens=10) + layer = get_layer_class()( + max_tokens=10, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -626,7 +775,12 @@ class IndexLookupSaveableTest(keras_parameterized.TestCase, vocab_data = ["earth", "wind", "and", "fire"] input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(max_tokens=10) + layer = get_layer_class()( + max_tokens=10, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + 
dtype=dtypes.string) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -643,25 +797,24 @@ class IndexLookupErrorTest(keras_parameterized.TestCase, def test_too_long_vocab_fails_in_single_setting(self): vocab_data = ["earth", "wind", "and", "fire"] - layer = get_layer_class()(max_tokens=4) + layer = get_layer_class()( + max_tokens=4, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) with self.assertRaisesRegex(ValueError, "vocabulary larger than the maximum vocab.*"): layer.set_vocabulary(vocab_data) - def test_too_long_vocab_fails_in_multiple_settings(self): - vocab_data = [["earth", "wind"], ["and", "fire"]] - layer = get_layer_class()(max_tokens=4) - - # The first time we call set_vocabulary, we're under the max_tokens - # so it should be fine. - layer.set_vocabulary(vocab_data[0]) - with self.assertRaisesRegex(ValueError, - "vocabulary larger than the maximum vocab.*"): - layer.set_vocabulary(vocab_data[1], append=True) - def test_zero_max_tokens_fails(self): with self.assertRaisesRegex(ValueError, ".*max_tokens.*"): - _ = get_layer_class()(max_tokens=0) + _ = get_layer_class()( + max_tokens=0, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) @keras_parameterized.run_all_keras_modes @@ -676,7 +829,12 @@ class IndexLookupSavingTest(keras_parameterized.TestCase, # Build and validate a golden model. input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(max_tokens=None) + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -705,8 +863,9 @@ class IndexLookupSavingTest(keras_parameterized.TestCase, @keras_parameterized.run_all_keras_modes -class IndexLookupCombinerTest(keras_parameterized.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): +class IndexLookupStringCombinerTest( + keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): def compare_text_accumulators(self, a, b, msg=None): if a is None or b is None: @@ -834,5 +993,123 @@ class IndexLookupCombinerTest(keras_parameterized.TestCase, self.validate_accumulator_extract(combiner, data, expected_extract_output) +@keras_parameterized.run_all_keras_modes +class IndexLookupIntCombinerTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest + ): + + def compare_text_accumulators(self, a, b, msg=None): + if a is None or b is None: + self.assertAllEqual(a, b, msg=msg) + + self.assertAllEqual(a.count_dict, b.count_dict, msg=msg) + + compare_accumulators = compare_text_accumulators + + def update_accumulator(self, accumulator, data): + accumulator.count_dict.update(dict(zip(data["vocab"], data["counts"]))) + + return accumulator + + def test_combiner_api_compatibility_int_mode(self): + data = np.array([[42, 1138, 725, 1729], [42, 1138, 725, 203]]) + combiner = index_lookup._IndexLookupCombiner() + expected_accumulator_output = { + "vocab": np.array([1138, 725, 42, 1729, 203]), + "counts": np.array([2, 2, 2, 1, 1]), + } + expected_extract_output = { + "vocab": np.array([1138, 725, 42, 1729, 203]), + } + expected_accumulator = combiner._create_accumulator() + expected_accumulator = self.update_accumulator(expected_accumulator, + expected_accumulator_output) + self.validate_accumulator_serialize_and_deserialize(combiner, 
data, + expected_accumulator) + self.validate_accumulator_uniqueness(combiner, data) + self.validate_accumulator_extract(combiner, data, expected_extract_output) + + # TODO(askerryryan): Add tests confirming equivalence to behavior of + # existing tf.keras.preprocessing.text.Tokenizer. + @parameterized.named_parameters( + { + "testcase_name": "top_k_smaller_than_full_vocab", + "data": np.array([[42, 1138], [1729, 1138], [725], [1729, 1138]]), + "vocab_size": 3, + "expected_accumulator_output": { + "vocab": np.array([1138, 1729, 725, 42]), + "counts": np.array([3, 2, 1, 1]), + }, + "expected_extract_output": { + "vocab": np.array([1138, 1729, 725]), + }, + }, + { + "testcase_name": "top_k_larger_than_full_vocab", + "data": np.array([[42, 1138], [1729, 1138], [725], [1729, 1138]]), + "vocab_size": 10, + "expected_accumulator_output": { + "vocab": np.array([1138, 1729, 725, 42]), + "counts": np.array([3, 2, 1, 1]), + }, + "expected_extract_output": { + "vocab": np.array([1138, 1729, 725, 42]), + }, + }, + { + "testcase_name": "no_top_k", + "data": np.array([[42, 1138], [1729, 1138], [725], [1729, 1138]]), + "vocab_size": None, + "expected_accumulator_output": { + "vocab": np.array([1138, 1729, 725, 42]), + "counts": np.array([3, 2, 1, 1]), + }, + "expected_extract_output": { + "vocab": np.array([1138, 1729, 725, 42]), + }, + }, + { + "testcase_name": "single_element_per_row", + "data": np.array([[42], [1138], [1729], [1138], [725]]), + "vocab_size": 3, + "expected_accumulator_output": { + "vocab": np.array([1138, 1729, 725, 42]), + "counts": np.array([2, 1, 1, 1]), + }, + "expected_extract_output": { + "vocab": np.array([1138, 1729, 725]), + }, + }, + # Which tokens are retained are based on global frequency, and thus are + # sensitive to frequency within a document. In contrast, because idf only + # considers the presence of a token in a document, it is insensitive + # to the frequency of the token within the document. + { + "testcase_name": + "retained_tokens_sensitive_to_within_document_frequency", + "data": + np.array([[42, 42], [1138, 1138], [1729, 1729], [1138, 1138], + [725, 203]]), + "vocab_size": + 3, + "expected_accumulator_output": { + "vocab": np.array([1138, 42, 1729, 725, 203]), + "counts": np.array([4, 2, 2, 1, 1]), + }, + "expected_extract_output": { + "vocab": np.array([1138, 1729, 42]), + }, + }) + def test_combiner_computation(self, data, vocab_size, + expected_accumulator_output, + expected_extract_output): + combiner = index_lookup._IndexLookupCombiner(vocab_size=vocab_size) + expected_accumulator = combiner._create_accumulator() + expected_accumulator = self.update_accumulator(expected_accumulator, + expected_accumulator_output) + self.validate_accumulator_computation(combiner, data, expected_accumulator) + self.validate_accumulator_extract(combiner, data, expected_extract_output) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/keras/layers/preprocessing/integer_lookup.py b/tensorflow/python/keras/layers/preprocessing/integer_lookup.py new file mode 100644 index 00000000000..671c02573db --- /dev/null +++ b/tensorflow/python/keras/layers/preprocessing/integer_lookup.py @@ -0,0 +1,112 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
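As a rough illustration of how `adapt()` builds a vocabulary through the combiner exercised by the tests above, the following sketch drives the private `_IndexLookupCombiner` directly, as those tests do; the compute/extract calls are internal API and shown only for explanation.

```python
import numpy as np

from tensorflow.python.keras.layers.preprocessing import index_lookup

data = np.array([["earth", "wind", "earth", ""],
                 ["fire", "earth", "wind", ""]])

combiner = index_lookup._IndexLookupCombiner(vocab_size=3, mask_value="")
accumulator = combiner.compute(data)

# The mask value "" is dropped from the counts; the remaining tokens are
# sorted by frequency (earth: 3, wind: 2, fire: 1) and capped at vocab_size.
print(combiner.extract(accumulator)["vocab"])  # earth, wind, fire
```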
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Keras string lookup preprocessing layer.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import dtypes +from tensorflow.python.keras.layers.preprocessing import index_lookup +from tensorflow.python.keras.layers.preprocessing import table_utils + + +class IntegerLookup(index_lookup.IndexLookup): + """Maps integers from a vocabulary to integer indices. + + This layer translates a set of arbitrary integers into an integer output via a + table-based lookup, with optional out-of-vocabulary handling. + + If desired, the user can call this layer's `adapt()` method on a data set, + which will analyze the data set, determine the frequency of individual string + values, and create a vocabulary from them. This vocabulary can have + unlimited size or be capped, depending on the configuration options for this + layer; if there are more unique values in the input than the maximum + vocabulary size, the most frequent terms will be used to create the + vocabulary. + + Attributes: + max_values: The maximum size of the vocabulary for this layer. If None, + there is no cap on the size of the vocabulary. Note that this vocabulary + includes the OOV and mask tokens, so the effective number of tokens is + (max_tokens - num_oov_tokens - (1 if mask_token else 0)) + num_oov_indices: The number of out-of-vocabulary values to use; defaults to + 1. If this value is more than 1, OOV inputs are hashed to determine their + OOV value; if this value is 0, passing an OOV input will result in a '-1' + being returned for that value in the output tensor. (Note that, because + the value is -1 and not 0, this will allow you to effectively drop OOV + values from categorical encodings.) + mask_value: A value that represents masked inputs, and which is mapped to + index 0. Defaults to 0. If set to None, no mask term will be added and the + OOV tokens, if any, will be indexed from (0...num_oov_tokens) instead of + (1...num_oov_tokens+1). + oov_value: The value representing an out-of-vocabulary value. Defaults to + -1. + vocabulary: An optional list of values, or a path to a text file containing + a vocabulary to load into this layer. The file should contain one value + per line. If the list or file contains the same token multiple times, an + error will be thrown. + """ + + def __init__(self, + max_values=None, + num_oov_indices=1, + mask_value=0, + oov_value=-1, + vocabulary=None, + **kwargs): + allowed_dtypes = [dtypes.int64] + + if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes: + raise ValueError("IntegerLookup may only have a dtype in %s." % + allowed_dtypes) + + if "dtype" not in kwargs: + kwargs["dtype"] = dtypes.int64 + + # If max_values is set, the value must be greater than 1 - otherwise we + # are creating a 0-element vocab, which doesn't make sense. 
+ if max_values is not None and max_values <= 1: + raise ValueError("If set, max_values must be greater than 1.") + + if num_oov_indices < 0: + raise ValueError("num_oov_indices must be greater than 0. You passed %s" % + num_oov_indices) + + if vocabulary is not None: + if isinstance(vocabulary, str): + vocabulary = table_utils.get_vocabulary_from_file(vocabulary) + vocabulary = [int(v) for v in vocabulary] + + super(IntegerLookup, self).__init__( + max_tokens=max_values, + num_oov_indices=num_oov_indices, + mask_token=mask_value, + oov_token=oov_value, + vocabulary=vocabulary, + **kwargs) + + def get_config(self): + base_config = super(IntegerLookup, self).get_config() + # Because the super config has a bunch of args we're also passing, + # we need to rename and remove them from the config dict. + base_config["max_values"] = base_config["max_tokens"] + del base_config["max_tokens"] + + base_config["mask_value"] = base_config["mask_token"] + del base_config["mask_token"] + + base_config["oov_value"] = base_config["oov_token"] + del base_config["oov_token"] + return base_config diff --git a/tensorflow/python/keras/layers/preprocessing/integer_lookup_test.py b/tensorflow/python/keras/layers/preprocessing/integer_lookup_test.py new file mode 100644 index 00000000000..515a1ca6667 --- /dev/null +++ b/tensorflow/python/keras/layers/preprocessing/integer_lookup_test.py @@ -0,0 +1,501 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
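A short sketch of the new `IntegerLookup` layer defined above, using its defaults `mask_value=0` and `oov_value=-1`. The internal import path follows its unit tests, and the commented mapping assumes the index layout described in the docstring.

```python
import numpy as np

from tensorflow.python import keras
from tensorflow.python.framework import dtypes
from tensorflow.python.keras.layers.preprocessing import integer_lookup

layer = integer_lookup.IntegerLookup(max_values=None)
layer.set_vocabulary(np.array([10, 11, 12, 13], dtype=np.int64))

inputs = keras.Input(shape=(None,), dtype=dtypes.int64)
model = keras.Model(inputs, layer(inputs))

# Index 0 is the mask value 0 and index 1 the single OOV index, so known
# values start at 2: 10 -> 2, 13 -> 5, 12 -> 4, and the unseen 42 -> 1.
print(model.predict(np.array([[10, 13, 42, 12]], dtype=np.int64)))

# get_config() reports the renamed arguments rather than the base-class names.
config = layer.get_config()
print(config["max_values"], config["mask_value"], config["oov_value"])
```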
+# ============================================================================== +"""Tests for Keras text vectorization preprocessing layer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import itertools +import os +import random + +from absl.testing import parameterized +import numpy as np + +from tensorflow.python import keras +from tensorflow.python import tf2 + +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.eager import context +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import sparse_tensor +from tensorflow.python.framework import tensor_shape +from tensorflow.python.keras import keras_parameterized +from tensorflow.python.keras import testing_utils +from tensorflow.python.keras.layers.preprocessing import integer_lookup +from tensorflow.python.keras.layers.preprocessing import integer_lookup_v1 +from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils +from tensorflow.python.keras.saving import save +from tensorflow.python.keras.utils.generic_utils import CustomObjectScope +from tensorflow.python.ops.ragged import ragged_factory_ops +from tensorflow.python.platform import gfile +from tensorflow.python.platform import test + + +def get_layer_class(): + if context.executing_eagerly(): + return integer_lookup.IntegerLookup + else: + return integer_lookup_v1.IntegerLookup + + +def _get_end_to_end_test_cases(): + test_cases = ( + { + "testcase_name": + "test_ints_soft_vocab_cap", + # Create an array where 1138 is the most frequent term, followed by + # 1729, then 725, then 42. This ensures that the vocab accumulator + # is sorting by frequency. + "vocab_data": + np.array([[42], [1138], [1138], [1138], [1138], [1729], [1729], + [1729], [725], [725]], + dtype=np.int64), + "input_data": + np.array([[1138], [1729], [725], [42], [42], [725], [1138], [4]], + dtype=np.int64), + "kwargs": { + "max_values": None, + "dtype": dtypes.int64, + }, + "expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]], + "input_dtype": + dtypes.int64 + },) + + crossed_test_cases = [] + # Cross above test cases with use_dataset in (True, False) + for use_dataset in (True, False): + for case in test_cases: + case = case.copy() + if use_dataset: + case["testcase_name"] = case["testcase_name"] + "_with_dataset" + case["use_dataset"] = use_dataset + crossed_test_cases.append(case) + + return crossed_test_cases + + +@keras_parameterized.run_all_keras_modes +class IntegerLookupLayerTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + @parameterized.named_parameters(*_get_end_to_end_test_cases()) + def test_layer_end_to_end_with_adapt(self, vocab_data, input_data, kwargs, + use_dataset, expected_output, + input_dtype): + cls = get_layer_class() + expected_output_dtype = dtypes.int64 + input_shape = input_data.shape + + if use_dataset: + # Keras APIs expect batched datasets. + # TODO(rachelim): `model.predict` predicts the result on each + # dataset batch separately, then tries to concatenate the results + # together. When the results have different shapes on the non-concat + # axis (which can happen in the output_mode = INT case for + # IntegerLookup), the concatenation fails. In real use cases, this may + # not be an issue because users are likely to pipe the preprocessing layer + # into other keras layers instead of predicting it directly. 
A workaround + # for these unit tests is to have the dataset only contain one batch, so + # no concatenation needs to happen with the result. For consistency with + # numpy input, we should make `predict` join differently shaped results + # together sensibly, with 0 padding. + input_data = dataset_ops.Dataset.from_tensor_slices(input_data).batch( + input_shape[0]) + vocab_data = dataset_ops.Dataset.from_tensor_slices(vocab_data).batch( + input_shape[0]) + + with CustomObjectScope({"IntegerLookup": cls}): + output_data = testing_utils.layer_test( + cls, + kwargs=kwargs, + input_shape=input_shape, + input_data=input_data, + input_dtype=input_dtype, + expected_output_dtype=expected_output_dtype, + validate_training=False, + adapt_data=vocab_data) + self.assertAllClose(expected_output, output_data) + + +@keras_parameterized.run_all_keras_modes +class CategoricalEncodingInputTest( + keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + def test_sparse_int_input(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = sparse_tensor.SparseTensor( + indices=[[0, 0], [1, 2]], + values=np.array([13, 32], dtype=np.int64), + dense_shape=[3, 4]) + + expected_indices = [[0, 0], [1, 2]] + expected_values = [5, 1] + expected_dense_shape = [3, 4] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True) + layer = get_layer_class()(max_values=None) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_data = model.predict(input_array, steps=1) + self.assertAllEqual(expected_indices, output_data.indices) + self.assertAllEqual(expected_values, output_data.values) + self.assertAllEqual(expected_dense_shape, output_data.dense_shape) + + def test_ragged_int_input(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 42]], + dtype=np.int64) + expected_output = [[2, 3, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True) + layer = get_layer_class()(max_values=None) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + +@keras_parameterized.run_all_keras_modes +class CategoricalEncodingMultiOOVTest( + keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + def test_sparse_int_input_multi_bucket(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = sparse_tensor.SparseTensor( + indices=[[0, 0], [1, 2]], + values=np.array([13, 133], dtype=np.int64), + dense_shape=[3, 4]) + + expected_indices = [[0, 0], [1, 2]] + expected_values = [6, 2] + expected_dense_shape = [3, 4] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True) + layer = get_layer_class()( + max_values=None, + dtype=dtypes.int64, + num_oov_indices=2, + mask_value=0, + oov_value=-1) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_data = model.predict(input_array, steps=1) + self.assertAllEqual(expected_indices, output_data.indices) + self.assertAllEqual(expected_values, output_data.values) + self.assertAllEqual(expected_dense_shape, output_data.dense_shape) + + def test_ragged_int_input_multi_bucket(self): + vocab_data = np.array([10, 11, 12, 13], 
dtype=np.int64) + input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 133]], + dtype=np.int64) + expected_output = [[3, 4, 6], [6, 5, 3, 2]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True) + layer = get_layer_class()(max_values=None, num_oov_indices=2) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + +@keras_parameterized.run_all_keras_modes +class CategoricalEncodingAdaptTest( + keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + def test_sparse_adapt(self): + vocab_data = sparse_tensor.SparseTensor( + indices=[[0, 0], [0, 1], [1, 2]], + values=[203, 1729, 203], + dense_shape=[3, 4]) + vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data) + + layer = get_layer_class()() + layer.adapt(vocab_dataset) + expected_vocabulary = [0, -1, 203, 1729] + self.assertAllEqual(expected_vocabulary, layer.get_vocabulary()) + + def test_ragged_adapt(self): + vocab_data = ragged_factory_ops.constant([[203], [1729, 203]]) + vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data) + + layer = get_layer_class()() + layer.adapt(vocab_dataset) + expected_vocabulary = [0, -1, 203, 1729] + self.assertAllEqual(expected_vocabulary, layer.get_vocabulary()) + + def test_sparse_int_input(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = sparse_tensor.SparseTensor( + indices=[[0, 0], [1, 2]], + values=np.array([13, 32], dtype=np.int64), + dense_shape=[3, 4]) + + expected_indices = [[0, 0], [1, 2]] + expected_values = [5, 1] + expected_dense_shape = [3, 4] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True) + layer = get_layer_class()(max_values=None) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_data = model.predict(input_array, steps=1) + self.assertAllEqual(expected_indices, output_data.indices) + self.assertAllEqual(expected_values, output_data.values) + self.assertAllEqual(expected_dense_shape, output_data.dense_shape) + + def test_ragged_int_input(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 42]], + dtype=np.int64) + expected_output = [[2, 3, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True) + layer = get_layer_class()(max_values=None) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_single_int_generator_dataset(self): + + def word_gen(): + for _ in itertools.count(1): + yield random.randint(0, 100) + + ds = dataset_ops.Dataset.from_generator(word_gen, dtypes.int64, + tensor_shape.TensorShape([])) + batched_ds = ds.take(2) + input_t = keras.Input(shape=(), dtype=dtypes.int64) + layer = get_layer_class()( + max_values=10, num_oov_indices=0, mask_value=None, oov_value=None) + _ = layer(input_t) + layer.adapt(batched_ds) + + +@keras_parameterized.run_all_keras_modes +class IntegerLookupOutputTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + def test_int_output(self): + vocab_data = [42, 1138, 725, 1729] + input_array = np.array([[42, 1138, 725, 
1729], [1729, 725, 42, 203]]) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64) + layer = get_layer_class()() + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_output_shape(self): + input_data = keras.Input(shape=(4,), dtype=dtypes.int64) + layer = get_layer_class()(max_values=None, num_oov_indices=1) + int_data = layer(input_data) + self.assertAllEqual(int_data.shape[1:], input_data.shape[1:]) + + def test_int_output_no_reserved_zero(self): + vocab_data = [42, 1138, 725, 1729] + input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) + expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64) + layer = get_layer_class()(max_values=None, mask_value=None) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_int_output_explicit_vocab(self): + vocab_data = [42, 1138, 725, 1729] + input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64) + layer = get_layer_class()( + vocabulary=vocab_data, + max_values=None, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + +@keras_parameterized.run_all_keras_modes +class IntegerLookupVocabularyTest( + keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + def _write_to_temp_file(self, file_name, vocab_list): + vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt") + with gfile.GFile(vocab_path, "w") as writer: + for vocab in vocab_list: + writer.write(str(vocab) + "\n") + writer.flush() + writer.close() + return vocab_path + + def test_int_output_explicit_vocab(self): + vocab_data = [42, 1138, 725, 1729] + input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64) + layer = get_layer_class()(vocabulary=vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_get_vocab_returns_int(self): + vocab_data = [42, 1138, 725, 1729] + expected_vocab = [0, -1, 42, 1138, 725, 1729] + layer = get_layer_class()(vocabulary=vocab_data) + layer_vocab = layer.get_vocabulary() + self.assertAllEqual(expected_vocab, layer_vocab) + self.assertIsInstance(layer_vocab[0], np.int64) + + def test_int_output_explicit_vocab_from_file(self): + vocab_list = [42, 1138, 725, 1729] + vocab_path = self._write_to_temp_file("vocab_file", vocab_list) + + input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64) + layer = get_layer_class()(vocabulary=vocab_path) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = 
model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_non_unique_vocab_fails(self): + vocab_data = [42, 1138, 725, 1729, 1729] + with self.assertRaisesRegex(ValueError, ".*repeated term.*1729.*"): + _ = get_layer_class()(vocabulary=vocab_data) + + def test_non_unique_vocab_from_file_fails(self): + vocab_list = [42, 1138, 725, 1729, 42] + vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list) + with self.assertRaisesRegex(ValueError, ".*repeated term.*42.*"): + _ = get_layer_class()(vocabulary=vocab_path) + + +@keras_parameterized.run_all_keras_modes(always_skip_eager=True) +class IntegerLookupSaveableTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest + ): + + def test_ops_are_not_added_with_multiple_get_set_weights(self): + vocab_data = [42, 1138, 725, 1729] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64) + layer = get_layer_class()(max_values=10) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + weights = model.get_weights() + model.set_weights(weights) + keras.backend.get_session().graph.finalize() + weights = model.get_weights() + model.set_weights(weights) + + def test_layer_saving_with_h5(self): + vocab_data = [42, 1138, 725, 1729] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64) + layer = get_layer_class()(max_values=10) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + path = os.path.join(self.get_temp_dir(), "model") + with self.assertRaisesRegex(NotImplementedError, + "Save or restore weights that is not.*"): + save.save_model(model, path, save_format="h5") + + +@keras_parameterized.run_all_keras_modes +class IntegerLookupErrorTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + def test_too_long_vocab_fails_in_single_setting(self): + vocab_data = [42, 1138, 725, 1729] + + layer = get_layer_class()(max_values=4, num_oov_indices=1) + with self.assertRaisesRegex(ValueError, + "vocabulary larger than the maximum vocab.*"): + layer.set_vocabulary(vocab_data) + + def test_zero_max_values_fails(self): + with self.assertRaisesRegex(ValueError, ".*max_values.*"): + _ = get_layer_class()(max_values=0, num_oov_indices=1) + + +@keras_parameterized.run_all_keras_modes +class IntegerLookupSavingTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + def test_vocabulary_persistence_across_saving(self): + vocab_data = [42, 1138, 725, 1729] + input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + # Build and validate a golden model. + input_data = keras.Input(shape=(None,), dtype=dtypes.int64) + layer = get_layer_class()(max_values=None, num_oov_indices=1) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(output_dataset, expected_output) + + # Save the model to disk. + output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model") + model.save(output_path, save_format="tf") + + # Delete the session and graph to ensure that the loaded model is generated + # from scratch. + # TODO(b/149526183): Can't clear session when TF2 is disabled. 
+    if tf2.enabled():
+      keras.backend.clear_session()
+
+    loaded_model = keras.models.load_model(
+        output_path, custom_objects={"IntegerLookup": get_layer_class()})
+
+    # Ensure that the loaded model is unique (so that the save/load is real)
+    self.assertIsNot(model, loaded_model)
+
+    # Validate correctness of the new model.
+    new_output_dataset = loaded_model.predict(input_array)
+    self.assertAllEqual(new_output_dataset, expected_output)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/layers/preprocessing/integer_lookup_v1.py b/tensorflow/python/keras/layers/preprocessing/integer_lookup_v1.py
new file mode 100644
index 00000000000..ec326f4d78b
--- /dev/null
+++ b/tensorflow/python/keras/layers/preprocessing/integer_lookup_v1.py
@@ -0,0 +1,25 @@
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras integer lookup preprocessing layer."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.keras.layers.preprocessing import index_lookup_v1
+from tensorflow.python.keras.layers.preprocessing import integer_lookup
+
+
+class IntegerLookup(integer_lookup.IntegerLookup, index_lookup_v1.IndexLookup):
+  pass
diff --git a/tensorflow/python/keras/layers/preprocessing/string_lookup.py b/tensorflow/python/keras/layers/preprocessing/string_lookup.py
new file mode 100644
index 00000000000..4032486b5f0
--- /dev/null
+++ b/tensorflow/python/keras/layers/preprocessing/string_lookup.py
@@ -0,0 +1,106 @@
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras string lookup preprocessing layer."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.keras.layers.preprocessing import index_lookup
+from tensorflow.python.keras.layers.preprocessing import table_utils
+
+
+class StringLookup(index_lookup.IndexLookup):
+  """Maps strings from a vocabulary to integer indices.
+
+  This layer translates a set of arbitrary strings into an integer output via a
+  table-based lookup, with optional out-of-vocabulary handling.
+ + If desired, the user can call this layer's `adapt()` method on a data set, + which will analyze the data set, determine the frequency of individual string + values, and create a vocabulary from them. This vocabulary can have + unlimited size or be capped, depending on the configuration options for this + layer; if there are more unique values in the input than the maximum + vocabulary size, the most frequent terms will be used to create the + vocabulary. + + Attributes: + max_tokens: The maximum size of the vocabulary for this layer. If None, + there is no cap on the size of the vocabulary. Note that this vocabulary + includes the OOV and mask tokens, so the effective number of tokens is + (max_tokens - num_oov_indices - (1 if mask_token else 0)) + num_oov_indices: The number of out-of-vocabulary tokens to use; defaults to + 1. If this value is more than 1, OOV inputs are hashed to determine their + OOV value; if this value is 0, passing an OOV input will result in a '-1' + being returned for that value in the output tensor. (Note that, because + the value is -1 and not 0, this will allow you to effectively drop OOV + values from categorical encodings.) + mask_token: A token that represents masked values, and which is mapped to + index 0. Defaults to the empty string "". If set to None, no mask term + will be added and the OOV tokens, if any, will be indexed from + (0...num_oov_indices) instead of (1...num_oov_indices+1). + oov_token: The token representing an out-of-vocabulary value. Defaults to + "[OOV]". + vocabulary: An optional list of vocabulary terms, or a path to a text file + containing a vocabulary to load into this layer. The file should contain + one token per line. If the list or file contains the same token multiple + times, an error will be thrown. + encoding: The Python string encoding to use. Defaults to `'utf-8'`. + """ + + def __init__(self, + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary=None, + encoding="utf-8", + **kwargs): + allowed_dtypes = [dtypes.string] + + if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes: + raise ValueError("StringLookup may only have a dtype in %s." % + allowed_dtypes) + + if "dtype" not in kwargs: + kwargs["dtype"] = dtypes.string + + if vocabulary is not None: + if isinstance(vocabulary, str): + vocabulary = table_utils.get_vocabulary_from_file(vocabulary, encoding) + + self.encoding = encoding + + super(StringLookup, self).__init__( + max_tokens=max_tokens, + num_oov_indices=num_oov_indices, + mask_token=mask_token, + oov_token=oov_token, + vocabulary=vocabulary, + **kwargs) + + def get_config(self): + config = {"encoding": self.encoding} + base_config = super(StringLookup, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def get_vocabulary(self): + if self._table_handler.vocab_size() == 0: + return [] + + keys, values = self._table_handler.data() + # This is required because the MutableHashTable doesn't preserve insertion + # order, but we rely on the order of the array to assign indices. + return [x.decode(self.encoding) for _, x in sorted(zip(values, keys))] diff --git a/tensorflow/python/keras/layers/preprocessing/string_lookup_test.py b/tensorflow/python/keras/layers/preprocessing/string_lookup_test.py new file mode 100644 index 00000000000..b2a610ac328 --- /dev/null +++ b/tensorflow/python/keras/layers/preprocessing/string_lookup_test.py @@ -0,0 +1,224 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for Keras text vectorization preprocessing layer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from absl.testing import parameterized +import numpy as np +import six + +from tensorflow.python import keras + +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.eager import context +from tensorflow.python.framework import dtypes +from tensorflow.python.keras import keras_parameterized +from tensorflow.python.keras import testing_utils +from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils +from tensorflow.python.keras.layers.preprocessing import string_lookup +from tensorflow.python.keras.layers.preprocessing import string_lookup_v1 +from tensorflow.python.keras.saving import save +from tensorflow.python.keras.utils.generic_utils import CustomObjectScope +from tensorflow.python.platform import gfile +from tensorflow.python.platform import test + + +def get_layer_class(): + if context.executing_eagerly(): + return string_lookup.StringLookup + else: + return string_lookup_v1.StringLookup + + +def _get_end_to_end_test_cases(): + test_cases = ( + { + "testcase_name": + "test_strings_soft_vocab_cap", + # Create an array where 'earth' is the most frequent term, followed by + # 'wind', then 'and', then 'fire'. This ensures that the vocab + # accumulator is sorting by frequency. + "vocab_data": + np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"], + ["wind"], ["wind"], ["wind"], ["and"], ["and"]]), + "input_data": + np.array([["earth"], ["wind"], ["and"], ["fire"], ["fire"], + ["and"], ["earth"], ["michigan"]]), + "kwargs": { + "max_tokens": None, + }, + "expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]], + "input_dtype": + dtypes.string + }, + ) + + crossed_test_cases = [] + # Cross above test cases with use_dataset in (True, False) + for use_dataset in (True, False): + for case in test_cases: + case = case.copy() + if use_dataset: + case["testcase_name"] = case["testcase_name"] + "_with_dataset" + case["use_dataset"] = use_dataset + crossed_test_cases.append(case) + + return crossed_test_cases + + +@keras_parameterized.run_all_keras_modes +class StringLookupLayerTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + @parameterized.named_parameters(*_get_end_to_end_test_cases()) + def test_layer_end_to_end_with_adapt(self, vocab_data, input_data, kwargs, + use_dataset, expected_output, + input_dtype): + cls = get_layer_class() + expected_output_dtype = dtypes.int64 + input_shape = input_data.shape + + if use_dataset: + # Keras APIs expect batched datasets. + # TODO(rachelim): `model.predict` predicts the result on each + # dataset batch separately, then tries to concatenate the results + # together. 
When the results have different shapes on the non-concat + # axis (which can happen in the output_mode = INT case for + # StringLookup), the concatenation fails. In real use cases, this may + # not be an issue because users are likely to pipe the preprocessing layer + # into other keras layers instead of predicting it directly. A workaround + # for these unit tests is to have the dataset only contain one batch, so + # no concatenation needs to happen with the result. For consistency with + # numpy input, we should make `predict` join differently shaped results + # together sensibly, with 0 padding. + input_data = dataset_ops.Dataset.from_tensor_slices(input_data).batch( + input_shape[0]) + vocab_data = dataset_ops.Dataset.from_tensor_slices(vocab_data).batch( + input_shape[0]) + + with CustomObjectScope({"StringLookup": cls}): + output_data = testing_utils.layer_test( + cls, + kwargs=kwargs, + input_shape=input_shape, + input_data=input_data, + input_dtype=input_dtype, + expected_output_dtype=expected_output_dtype, + validate_training=False, + adapt_data=vocab_data) + self.assertAllClose(expected_output, output_data) + + +@keras_parameterized.run_all_keras_modes +class StringLookupVocabularyTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest + ): + + def _write_to_temp_file(self, file_name, vocab_list): + vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt") + with gfile.GFile(vocab_path, "w") as writer: + for vocab in vocab_list: + writer.write(vocab + "\n") + writer.flush() + writer.close() + return vocab_path + + def test_int_output_explicit_vocab(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array([["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"]]) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.string) + layer = get_layer_class()(vocabulary=vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_get_vocab_returns_str(self): + vocab_data = ["earth", "wind", "and", "fire"] + expected_vocab = ["", "[OOV]", "earth", "wind", "and", "fire"] + layer = get_layer_class()(vocabulary=vocab_data) + layer_vocab = layer.get_vocabulary() + self.assertAllEqual(expected_vocab, layer_vocab) + self.assertIsInstance(layer_vocab[0], six.text_type) + + def test_int_output_explicit_vocab_from_file(self): + vocab_list = ["earth", "wind", "and", "fire"] + vocab_path = self._write_to_temp_file("vocab_file", vocab_list) + + input_array = np.array([["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"]]) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.string) + layer = get_layer_class()(vocabulary=vocab_path) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_non_unique_vocab_fails(self): + vocab_data = ["earth", "wind", "and", "fire", "fire"] + with self.assertRaisesRegex(ValueError, ".*repeated term.*fire.*"): + _ = get_layer_class()(vocabulary=vocab_data) + + def test_non_unique_vocab_from_file_fails(self): + vocab_list = ["earth", "wind", "and", "fire", "earth"] + vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list) + with 
self.assertRaisesRegex(ValueError, ".*repeated term.*earth.*"): + _ = get_layer_class()(vocabulary=vocab_path) + + +@keras_parameterized.run_all_keras_modes(always_skip_eager=True) +class StringLookupSaveableTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + def test_ops_are_not_added_with_multiple_get_set_weights(self): + vocab_data = ["earth", "wind", "and", "fire"] + + input_data = keras.Input(shape=(None,), dtype=dtypes.string) + layer = get_layer_class()(max_tokens=10) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + weights = model.get_weights() + model.set_weights(weights) + keras.backend.get_session().graph.finalize() + weights = model.get_weights() + model.set_weights(weights) + + def test_layer_saving_with_h5(self): + vocab_data = ["earth", "wind", "and", "fire"] + + input_data = keras.Input(shape=(None,), dtype=dtypes.string) + layer = get_layer_class()(max_tokens=10) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + path = os.path.join(self.get_temp_dir(), "model") + with self.assertRaisesRegex(NotImplementedError, + "Save or restore weights that is not.*"): + save.save_model(model, path, save_format="h5") + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/python/keras/layers/preprocessing/string_lookup_v1.py b/tensorflow/python/keras/layers/preprocessing/string_lookup_v1.py new file mode 100644 index 00000000000..0d4c70de655 --- /dev/null +++ b/tensorflow/python/keras/layers/preprocessing/string_lookup_v1.py @@ -0,0 +1,25 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Keras string lookup preprocessing layer.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.keras.layers.preprocessing import index_lookup_v1 +from tensorflow.python.keras.layers.preprocessing import string_lookup + + +class StringLookup(string_lookup.StringLookup, index_lookup_v1.IndexLookup): + pass diff --git a/tensorflow/python/keras/layers/preprocessing/table_utils.py b/tensorflow/python/keras/layers/preprocessing/table_utils.py index f5397da1f3e..05447f6e9ff 100644 --- a/tensorflow/python/keras/layers/preprocessing/table_utils.py +++ b/tensorflow/python/keras/layers/preprocessing/table_utils.py @@ -189,4 +189,3 @@ def convert_to_ndarray(x, dtype=None): if np.can_cast(array.dtype, np_dtype): array = array.astype(np_dtype, casting="safe") return array - diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py index 119e0b5ccff..4156ba50c02 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py @@ -32,7 +32,7 @@ from tensorflow.python.keras import backend as K from tensorflow.python.keras.engine.base_preprocessing_layer import Combiner from tensorflow.python.keras.engine.base_preprocessing_layer import CombinerPreprocessingLayer from tensorflow.python.keras.layers.preprocessing import categorical_encoding -from tensorflow.python.keras.layers.preprocessing import index_lookup +from tensorflow.python.keras.layers.preprocessing import string_lookup from tensorflow.python.keras.utils import layer_utils from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops @@ -269,10 +269,6 @@ class TextVectorization(CombinerPreprocessingLayer): self._max_tokens = max_tokens - # In INT mode, we have two reserved values (PAD and OOV). However, non-INT - # modes don't have a PAD value, so we only need to reserve one value. - self._reserved_values = 2 if output_mode == INT else 1 - # In INT mode, the zero value is reserved for padding (per Keras standard # padding approaches). In non-INT modes, there is no padding so we can set # the OOV value to zero instead of one. @@ -303,9 +299,9 @@ class TextVectorization(CombinerPreprocessingLayer): self._max_vocab_size, compute_idf=output_mode == TFIDF), **kwargs) - reserve_zero = output_mode in [None, INT] + mask_token = "" if output_mode in [None, INT] else None self._index_lookup_layer = self._get_index_lookup_class()( - max_tokens=max_tokens, reserve_zero=reserve_zero, dtype=dtypes.string) + max_tokens=max_tokens, mask_token=mask_token) # If this layer is configured for string or integer output, we do not # create a vectorization layer (as the output is not vectorized). @@ -328,7 +324,7 @@ class TextVectorization(CombinerPreprocessingLayer): return (keys.numpy(), values.numpy()) def _get_index_lookup_class(self): - return index_lookup.IndexLookup + return string_lookup.StringLookup def _to_numpy(self, preprocessed_data): """Converts preprocessed inputs into numpy arrays.""" @@ -428,26 +424,21 @@ class TextVectorization(CombinerPreprocessingLayer): def set_vocabulary(self, vocab, df_data=None, - oov_df_value=None, - append=False): + oov_df_value=None): """Sets vocabulary (and optionally document frequency) data for this layer. 
This method sets the vocabulary and DF data for this layer directly, instead of analyzing a dataset through 'adapt'. It should be used whenever the vocab (and optionally document frequency) information is already known. If - vocabulary data is already present in the layer, this method will either - replace it, if 'append' is set to False, or append to it (if 'append' is set - to True). + vocabulary data is already present in the layer, this method will replace + it. Arguments: vocab: An array of string tokens. df_data: An array of document frequency data. Only necessary if the layer output_mode is TFIDF. oov_df_value: The document frequency of the OOV token. Only necessary if - output_mode is TFIDF. OOV data is optional when appending additional - data in TFIDF mode; if an OOV value is supplied it will overwrite the - existing OOV value. - append: Whether to overwrite or append any existing vocabulary data. + output_mode is TFIDF. Raises: ValueError: If there are too many inputs, the inputs do not match, or @@ -468,8 +459,7 @@ class TextVectorization(CombinerPreprocessingLayer): "be changed after the layer is " "called.").format(mode=self._output_mode)) - current_table_size = self._index_lookup_layer.vocab_size() - self._index_lookup_layer.set_vocabulary(vocab, append) + self._index_lookup_layer.set_vocabulary(vocab) # When doing raw or integer output, we don't have a Vectorize layer to # manage. In this case, we can return directly. @@ -477,14 +467,9 @@ class TextVectorization(CombinerPreprocessingLayer): return if not self._pad_to_max or self._max_tokens is None: - num_tokens = self._index_lookup_layer.vocab_size() + self._reserved_values + num_tokens = self._index_lookup_layer.vocab_size() self._vectorize_layer.set_num_elements(num_tokens) - # We're only _really_ appending if the table_size is nonzero. This is - # important for some sanity checks in tfidf mode (specifically, checking if - # oov_df_value is set or not) and handling existing tfidf weight data. - append = append if current_table_size > 0 else False - if self._output_mode == TFIDF: if df_data is None: raise ValueError("df_data must be set if output_mode is TFIDF") @@ -492,31 +477,14 @@ class TextVectorization(CombinerPreprocessingLayer): raise ValueError("df_data must be the same length as vocab. " "len(df_data) is %s, len(vocab) is %s" % (len(vocab), len(df_data))) - if not append and oov_df_value is None: - raise ValueError("You must pass an oov_df_value the first time " - "'set_vocabulary' is called when output_mode is " + if oov_df_value is None: + raise ValueError("You must pass an oov_df_value when output_mode is " "TFIDF.") df_data = self._convert_to_ndarray(df_data) - if append: - # The existing IDF data is stored in a Keras weight, so we can get it - # by calling K.get_value() on the weight object. Take the first - # table_size+1 values in case we're padding the weight with zeros - existing_df_data = K.get_value( - self._vectorize_layer.tf_idf_weights)[:current_table_size + 1] - df_data = np.append(existing_df_data, df_data, axis=0) - # If we are appending and need to replace the OOV DF value, we can - # assign it over the existing OOV DF value at index 0 of the (already- - # concatenated) DF value array. - if oov_df_value is not None: - df_data[0] = oov_df_value - else: - # If we are not appending (that is, we have only new data) we need to - # insert the OOV value to the front of the array. (This is a append to - # the head, not a replacement of the zeroth value.) 
- if not isinstance(oov_df_value, np.ndarray): - oov_df_value = np.array([oov_df_value]) - df_data = np.insert(df_data, 0, oov_df_value) + if not isinstance(oov_df_value, np.ndarray): + oov_df_value = np.array([oov_df_value]) + df_data = np.insert(df_data, 0, oov_df_value) self._vectorize_layer.set_tfidf_data(df_data) def build(self, input_shape): @@ -536,8 +504,10 @@ class TextVectorization(CombinerPreprocessingLayer): if not self.built: raise RuntimeError("_set_state_variables() must be called after build().") if self._output_mode == TFIDF: - self.set_vocabulary(updates[_VOCAB_NAME], updates[_IDF_NAME], - updates[_OOV_IDF_NAME]) + self.set_vocabulary( + updates[_VOCAB_NAME], + updates[_IDF_NAME], + updates[_OOV_IDF_NAME]) else: self.set_vocabulary(updates[_VOCAB_NAME]) diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py index d8325f39149..f8a1f5b9434 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py @@ -619,25 +619,6 @@ class TextVectorizationOutputTest( output_dataset = model.predict(input_array) self.assertAllEqual(expected_output, output_dataset) - def test_vocab_appending(self): - vocab_data = [["earth", "wind"], ["and", "fire"]] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()( - max_tokens=5, - standardize=None, - split=None, - output_mode=text_vectorization.INT) - layer.set_vocabulary(vocab_data[0]) - layer.set_vocabulary(vocab_data[1], append=True) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllClose(expected_output, output_dataset) - def test_int_output_densifies_with_zeros(self): vocab_data = ["earth", "wind", "and", "fire"] # Create an input array that has 5 elements in the first example and 4 in @@ -1046,7 +1027,10 @@ class TextVectorizationOutputTest( split=None, output_mode=text_vectorization.TFIDF, pad_to_max_tokens=True) - layer.set_vocabulary(vocab_data, df_data=tfidf_data, oov_df_value=.05) + layer.set_vocabulary( + vocab_data, + df_data=tfidf_data, + oov_df_value=.05) int_data = layer(input_data) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ -1084,60 +1068,6 @@ class TextVectorizationOutputTest( output_dataset = model.predict(input_array) self.assertAllClose(expected_output, output_dataset) - def test_tfidf_appending(self): - vocab_data = [["earth", "wind"], ["and", "fire"]] - tfidf_data = [[.5, .25], [.2, .125]] - input_array = np.array([["earth", "wind", "and", "earth"], - ["ohio", "fire", "earth", "michigan"]]) - - # pyformat: disable - # pylint: disable=bad-whitespace - expected_output = [[ 0, 1, .25, .2, 0], - [.1, .5, 0, 0, .125]] - # pylint: enable=bad-whitespace - # pyformat: enable - - input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()( - max_tokens=5, - standardize=None, - split=None, - output_mode=text_vectorization.TFIDF) - layer.set_vocabulary(vocab_data[0], df_data=tfidf_data[0], oov_df_value=.05) - layer.set_vocabulary(vocab_data[1], df_data=tfidf_data[1], append=True) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = 
model.predict(input_array) - self.assertAllClose(expected_output, output_dataset) - - def test_tfidf_appending_with_oov_replacement(self): - vocab_data = [["earth", "wind"], ["and", "fire"]] - tfidf_data = [[.5, .25], [.2, .125]] - input_array = np.array([["earth", "wind", "and", "earth"], - ["ohio", "fire", "earth", "michigan"]]) - - # pyformat: disable - # pylint: disable=bad-whitespace - expected_output = [[ 0, 1, .25, .2, 0], - [1.5, .5, 0, 0, .125]] - # pylint: enable=bad-whitespace - # pyformat: enable - - input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()( - max_tokens=5, - standardize=None, - split=None, - output_mode=text_vectorization.TFIDF) - layer.set_vocabulary(vocab_data[0], df_data=tfidf_data[0], oov_df_value=.05) - # Note that here we've replaced the OOV vaue. - layer.set_vocabulary( - vocab_data[1], df_data=tfidf_data[1], oov_df_value=.75, append=True) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllClose(expected_output, output_dataset) - def test_accept_1D_input(self): input_array = np.array(["earth wind and fire", "fire and earth michigan"]) @@ -1274,22 +1204,6 @@ class TextVectorizationErrorTest(keras_parameterized.TestCase, "vocabulary larger than the maximum vocab.*"): layer.set_vocabulary(vocab_data) - def test_too_long_vocab_fails_in_multiple_settings(self): - vocab_data = [["earth", "wind"], ["and", "fire"]] - - layer = get_layer_class()( - max_tokens=4, - standardize=None, - split=None, - output_mode=text_vectorization.INT) - - # The first time we call set_vocabulary, we're under the max_tokens limit - # so it should be fine. - layer.set_vocabulary(vocab_data[0]) - with self.assertRaisesRegex(ValueError, - "vocabulary larger than the maximum vocab.*"): - layer.set_vocabulary(vocab_data[1], append=True) - def test_setting_vocab_without_tfidf_data_fails_in_tfidf_mode(self): vocab_data = ["earth", "wind", "and", "fire"] @@ -1326,18 +1240,6 @@ class TextVectorizationErrorTest(keras_parameterized.TestCase, "You must pass an oov_df_value.*"): layer.set_vocabulary(vocab_data, df_data) - def test_tfidf_set_vocab_with_no_oov_fails_with_append_set(self): - vocab_data = ["earth", "wind", "and", "fire"] - df_data = [1, 2, 3, 4] - layer = get_layer_class()( - max_tokens=5, - standardize=None, - split=None, - output_mode=text_vectorization.TFIDF) - with self.assertRaisesRegex(ValueError, - "You must pass an oov_df_value.*"): - layer.set_vocabulary(vocab_data, df_data, append=True) - def test_set_tfidf_in_non_tfidf_fails(self): vocab_data = ["earth", "wind", "and", "fire"] df_data = [1, 2, 3, 4] diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization_v1.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization_v1.py index b869bee52ab..59cf2c61288 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization_v1.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization_v1.py @@ -23,7 +23,7 @@ import numpy as np from tensorflow.python.keras import backend as K from tensorflow.python.keras.engine import base_preprocessing_layer_v1 from tensorflow.python.keras.layers.preprocessing import categorical_encoding_v1 -from tensorflow.python.keras.layers.preprocessing import index_lookup_v1 +from tensorflow.python.keras.layers.preprocessing import string_lookup_v1 from tensorflow.python.keras.layers.preprocessing import text_vectorization from tensorflow.python.ops.ragged import 
ragged_tensor_value from tensorflow.python.util.tf_export import keras_export @@ -84,7 +84,7 @@ class TextVectorization(text_vectorization.TextVectorization, return categorical_encoding_v1.CategoricalEncoding def _get_index_lookup_class(self): - return index_lookup_v1.IndexLookup + return string_lookup_v1.StringLookup def _to_numpy(self, data): """Converts preprocessed inputs into numpy arrays.""" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt index 47852865558..4f5b0f480e4 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt @@ -221,7 +221,7 @@ tf_class { } member_method { name: "set_vocabulary" - argspec: "args=[\'self\', \'vocab\', \'df_data\', \'oov_df_value\', \'append\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], " + argspec: "args=[\'self\', \'vocab\', \'df_data\', \'oov_df_value\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "set_weights" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt index 05154268354..a33f65189fd 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt @@ -219,7 +219,7 @@ tf_class { } member_method { name: "set_vocabulary" - argspec: "args=[\'self\', \'vocab\', \'df_data\', \'oov_df_value\', \'append\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], " + argspec: "args=[\'self\', \'vocab\', \'df_data\', \'oov_df_value\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "set_weights"
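
For reference, a minimal usage sketch of the two lookup layers added in this change, mirroring the index layout that the new tests assert (index 0 for the mask token, index 1 for the single OOV token, vocabulary terms starting at index 2). It assumes eager execution (so the non-v1 classes apply) and the internal module paths introduced by this diff; the sample data is illustrative only.

import numpy as np

from tensorflow.python.keras.layers.preprocessing import integer_lookup
from tensorflow.python.keras.layers.preprocessing import string_lookup

# Index 0 is reserved for the mask token "" and index 1 for the "[OOV]" token,
# so the first vocabulary term maps to index 2.
str_layer = string_lookup.StringLookup(vocabulary=["earth", "wind", "and", "fire"])
print(str_layer(np.array([["earth", "fire", "michigan"]])))  # -> [[2, 5, 1]]

# IntegerLookup uses the same layout for integer values: mask_value=0,
# oov_value=-1, and vocabulary values starting at index 2.
int_layer = integer_lookup.IntegerLookup(vocabulary=[42, 1138, 725, 1729])
print(int_layer(np.array([[42, 1729, 203]], dtype=np.int64)))  # -> [[2, 5, 1]]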
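
Similarly, a sketch of the trimmed TextVectorization.set_vocabulary call pattern now that the append argument is removed, reusing the vocabulary and TF-IDF values from the deleted appending tests; in TFIDF mode the vocabulary, its document-frequency data, and the OOV document frequency are supplied together in a single call.

from tensorflow.python.keras.layers.preprocessing import text_vectorization

layer = text_vectorization.TextVectorization(
    max_tokens=5,
    standardize=None,
    split=None,
    output_mode=text_vectorization.TFIDF)
# With `append` gone, the full vocabulary and its DF data go in one call, and
# oov_df_value is required whenever output_mode is TFIDF.
layer.set_vocabulary(
    ["earth", "wind", "and", "fire"],
    df_data=[.5, .25, .2, .125],
    oov_df_value=.05)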