Split index_lookup into string_lookup and integer_lookup.

PiperOrigin-RevId: 311651579
Change-Id: Ie033727dbe1026a7c7a88e4b31653840a17ac3d1
A. Unique TensorFlower 2020-05-14 19:07:07 -07:00 committed by TensorFlower Gardener
parent 0de7edf8b1
commit efa3fb28d9
16 changed files with 1658 additions and 439 deletions

View File: tensorflow/python/keras/layers/preprocessing/BUILD

@ -27,10 +27,12 @@ py_library(
":discretization",
":hashing",
":image_preprocessing",
":integer_lookup",
":normalization",
":preprocessing_stage",
":preprocessing_test_utils",
":reduction",
":string_lookup",
":text_vectorization",
],
)
@ -146,6 +148,20 @@ py_library(
],
)
py_library(
name = "integer_lookup",
srcs = [
"integer_lookup.py",
"integer_lookup_v1.py",
],
srcs_version = "PY2AND3",
deps = [
":index_lookup",
":table_utils",
"//tensorflow/python:dtypes",
],
)
py_library(
name = "table_utils",
srcs = [
@ -179,7 +195,7 @@ py_library(
srcs_version = "PY2AND3",
deps = [
":categorical_encoding",
":index_lookup",
":string_lookup",
"//tensorflow/python:array_ops",
"//tensorflow/python:control_flow_ops",
"//tensorflow/python:dtypes",
@ -235,6 +251,20 @@ py_library(
],
)
py_library(
name = "string_lookup",
srcs = [
"string_lookup.py",
"string_lookup_v1.py",
],
srcs_version = "PY2AND3",
deps = [
":index_lookup",
":table_utils",
"//tensorflow/python:dtypes",
],
)
py_library(
name = "preprocessing_stage",
srcs = [
@ -442,6 +472,22 @@ tf_py_test(
],
)
tf_py_test(
name = "integer_lookup_test",
size = "medium",
srcs = ["integer_lookup_test.py"],
python_version = "PY3",
deps = [
":integer_lookup",
":preprocessing_test_utils",
"//tensorflow/python:client_testlib",
"//tensorflow/python/keras",
"//tensorflow/python/keras/utils:generic_utils",
"//tensorflow/python/ops/ragged:ragged_string_ops",
"@absl_py//absl/testing:parameterized",
],
)
distribute_py_test(
name = "normalization_distribution_test",
srcs = ["normalization_distribution_test.py"],
@ -517,6 +563,22 @@ tf_py_test(
],
)
tf_py_test(
name = "string_lookup_test",
size = "medium",
srcs = ["string_lookup_test.py"],
python_version = "PY3",
deps = [
":preprocessing_test_utils",
":string_lookup",
"//tensorflow/python:client_testlib",
"//tensorflow/python/keras",
"//tensorflow/python/keras/utils:generic_utils",
"//tensorflow/python/ops/ragged:ragged_string_ops",
"@absl_py//absl/testing:parameterized",
],
)
tf_py_test(
name = "preprocessing_stage_test",
srcs = ["preprocessing_stage_test.py"],

View File: tensorflow/python/keras/layers/preprocessing/index_lookup.py

@ -41,14 +41,16 @@ _ACCUMULATOR_COUNTS_NAME = "counts"
class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer):
"""Maps strings (or integers) from a vocabulary to integer indices.
"""Maps values from a vocabulary to integer indices.
This layer translates a set of arbitrary strings or integers into an integer
output via a table-based lookup, with optional out-of-vocabulary handling.
This layer translates a set of arbitrary hashables into an integer output via
a table-based lookup, with optional out-of-vocabulary handling. This is the
basis layer for both IntegerLookup and StringLookup; it holds the common
logic but is not intended to be exported as part of the Keras API.
If desired, the user can call this layer's `adapt()` method on a data set,
which will analyze the data set, determine the frequency of individual string
or integer values, and create a vocabulary from them. This vocabulary can have
values, and create a vocabulary from them. This vocabulary can have
unlimited size or be capped, depending on the configuration options for this
layer; if there are more unique values in the input than the maximum
vocabulary size, the most frequent terms will be used to create the
@ -56,84 +58,47 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer):
Attributes:
max_tokens: The maximum size of the vocabulary for this layer. If None,
there is no cap on the size of the vocabulary. Note that the vocabulary
does include OOV buckets, so the effective number of unique values in the
vocabulary is `(max_tokens - num_oov_tokens)` when this value is set.
num_oov_tokens: The number of out-of-vocabulary tokens to use; defaults to
1. If this value is more than 1, OOV inputs are hashed to determine their
OOV value; if this value is 0, passing an OOV input will result in a '-1'
being returned for that value in the output tensor. (Note that, because
the value is -1 and not 0, this will allow you to effectively drop OOV
values from categorical encodings.)
vocabulary: An optional list of vocabulary terms, or a path to a text file
containing a vocabulary to load into this layer. The file should contain
one token per line. In either case, the vocabulary must be unique; if
the list or file contains the same token multiple times, an error will
be thrown. Note that when passing a vocabulary - either as a list or as
a file - the vocabulary will not be present in the layer's config dict;
it will instead be a part of the layer's weights.
reserve_zero: Whether to reserve the index 0, which indicates pad values in
the Keras masking system. If True, the output of this layer will be in the
range `[1...max_tokens+1)`; if False, the output will be in the range
`[0...max_tokens)`. Defaults to True.
mask_zero: If True, input values of 0 (for integers) and `""` (for strings)
will be treated as masked values and assigned an output value of 0. If
this option is set, `reserve_zero` must also be set. Defaults to False.
Call arguments:
inputs: The data to look up. Can be a tf.Tensor or RaggedTensor.
invert: Controls the lookup direction. If False, the layer will map strings
to integers; if true, the layer will map integers to strings. Defaults
to False.
there is no cap on the size of the vocabulary. Note that this vocabulary
includes the OOV and mask tokens, so the effective number of tokens is
(max_tokens - num_oov_indices - (1 if mask_token else 0))
num_oov_indices: The number of out-of-vocabulary tokens to use. If this
value is more than 1, OOV inputs are hashed to determine their OOV value;
if this value is 0, passing an OOV input will result in a '-1' being
returned for that value in the output tensor. (Note that, because the
value is -1 and not 0, this will allow you to effectively drop OOV values
from categorical encodings.)
mask_token: A token that represents masked values, and which is mapped to
index 0. If set to None, no mask term will be added and the OOV tokens, if
any, will be indexed from (0...num_oov_indices) instead of
(1...num_oov_indices+1).
oov_token: The token representing an out-of-vocabulary value. This token is
only used when performing an inverse lookup.
vocabulary: An optional list of vocabulary terms. If the list contains the
same token multiple times, an error will be thrown.
"""
# TODO(momernick): Add an examples section to the docstring.
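# Illustrative usage sketch (not part of this change; the argument values
# mirror the string test cases below, and `tf.constant` is assumed to be
# available in the calling code):
#
#   layer = IndexLookup(
#       max_tokens=None, num_oov_indices=1, mask_token="", oov_token="[OOV]",
#       vocabulary=["earth", "wind", "and", "fire"], dtype=dtypes.string)
#   layer(tf.constant([["earth", "wind"], ["fire", "michigan"]]))
#   # => [[2, 3], [5, 1]]  (index 0 is the mask token, index 1 is the OOV
#   #     token, and the vocabulary occupies indices 2 and up)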
def __init__(self,
max_tokens=None,
num_oov_tokens=1,
max_tokens,
num_oov_indices,
mask_token,
oov_token,
vocabulary=None,
reserve_zero=True,
mask_zero=False,
**kwargs):
invert = False
if invert:
allowed_dtypes = [dtypes.int32, dtypes.int64]
else:
allowed_dtypes = [dtypes.string, dtypes.int32, dtypes.int64]
if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes:
raise ValueError("TextVectorization may only have a dtype in %s." %
allowed_dtypes)
if "dtype" not in kwargs:
kwargs["dtype"] = dtypes.int64 if invert else dtypes.string
# If max_tokens is set, the value must be greater than 1 - otherwise we
# are creating a 0-element vocab, which doesn't make sense.
if max_tokens is not None and max_tokens <= 1:
raise ValueError("If set, max_tokens must be greater than 1.")
if num_oov_tokens < 0:
raise ValueError("num_oov_tokens must be greater than 0. You passed %s" %
num_oov_tokens)
if num_oov_indices < 0:
raise ValueError("num_oov_indices must be greater than 0. You passed %s" %
num_oov_indices)
self.invert = invert
self.max_tokens = max_tokens
self.num_oov_tokens = num_oov_tokens
self.reserve_zero = reserve_zero
self.mask_zero = mask_zero
# We need to reserve at least num_oov_tokens tokens, plus one additional
# value if we are reserving the zero value in our output.
if reserve_zero:
self._reserved_values = (num_oov_tokens + 1)
else:
self._reserved_values = num_oov_tokens
# We need to account for the OOV buckets in our vocabulary size.
if max_tokens is not None:
self._max_elements = max_tokens - num_oov_tokens
else:
self._max_elements = None
self.num_oov_indices = num_oov_indices
self.oov_token = oov_token
self.mask_token = mask_token
# If there is only one OOV bucket, we can determine the OOV value (either 0
# or 1 depending on whether 0 is reserved) and set that as the default
@ -141,20 +106,17 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer):
# do a further hashing step; to make this easier, we set the OOV value to
# -1. (This lets us do a vectorized add and cast to boolean to determine
# locations where we need to do extra hashing.)
if self.num_oov_tokens == 1:
self._oov_value = 1 if reserve_zero else 0
if self.num_oov_indices == 1:
self._oov_value = 0 if mask_token is None else 1
else:
self._oov_value = -1
super(IndexLookup, self).__init__(
combiner=_IndexLookupCombiner(self.max_tokens), **kwargs)
combiner=_IndexLookupCombiner(self.max_tokens, self.mask_token),
**kwargs)
self._output_dtype = dtypes.int64
# If the layer's input type is int32, we can only output int32 values -
# MutableHashTable doesn't allow us to map int32->int64.
if self.dtype == dtypes.int32:
self._output_dtype = dtypes.int32
else:
self._output_dtype = dtypes.int64
self._table = lookup_ops.MutableHashTable(
key_dtype=self.dtype,
value_dtype=self._output_dtype,
@ -167,33 +129,27 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer):
# counting code in the Model object doesn't throw an attribute error.
tracked_table.shape = tensor_shape.TensorShape((0,))
if self.num_oov_tokens <= 1:
oov_tokens = None
if self.num_oov_indices <= 1:
oov_indices = None
else:
oov_start = 1 if reserve_zero else 0
oov_tokens = list(range(oov_start, self._reserved_values))
oov_start = 1 if mask_token is not None else 0
oov_end = oov_start + num_oov_indices
oov_indices = list(range(oov_start, oov_end))
self._table_handler = table_utils.TableHandler(
table=self._table,
oov_tokens=oov_tokens,
oov_tokens=oov_indices,
use_v1_apis=self._use_v1_apis())
if vocabulary is not None:
if isinstance(vocabulary, str):
vocabulary = table_utils.get_vocabulary_from_file(vocabulary)
table_utils.validate_vocabulary_is_unique(vocabulary)
self.set_vocabulary(vocabulary)
def compute_output_shape(self, input_shape):
return input_shape
def compute_output_signature(self, input_spec, invert=False):
def compute_output_signature(self, input_spec):
output_shape = self.compute_output_shape(input_spec.shape.as_list())
if invert:
output_dtype = dtypes.string
else:
output_dtype = dtypes.int64
output_dtype = dtypes.int64
return tensor_spec.TensorSpec(shape=output_shape, dtype=output_dtype)
def adapt(self, data, reset_state=True):
@ -220,10 +176,7 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer):
keys, values = self._table_handler.data()
# This is required because the MutableHashTable doesn't preserve insertion
# order, but we rely on the order of the array to assign indices.
if self.dtype == dtypes.string:
return [x.decode("utf-8") for _, x in sorted(zip(values, keys))]
else:
return [x for _, x in sorted(zip(values, keys))]
return [x for _, x in sorted(zip(values, keys))]
def vocab_size(self):
return self._table_handler.vocab_size()
@ -231,10 +184,9 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer):
def get_config(self):
config = {
"max_tokens": self.max_tokens,
"num_oov_tokens": self.num_oov_tokens,
"vocabulary": None,
"reserve_zero": self.reserve_zero,
"mask_zero": self.mask_zero,
"num_oov_indices": self.num_oov_indices,
"oov_token": self.oov_token,
"mask_token": self.mask_token,
}
base_config = super(IndexLookup, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
@ -246,46 +198,101 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer):
# abstraction for ease of saving!) we return 0.
return 0
def set_vocabulary(self,
vocab,
append=False):
def set_vocabulary(self, vocab):
"""Sets vocabulary (and optionally document frequency) data for this layer.
This method sets the vocabulary for this layer directly, instead of
analyzing a dataset through 'adapt'. It should be used whenever the vocab
information is already known. If vocabulary data is already present in the
layer, this method will either replace it, if 'append' is set to False, or
append to it (if 'append' is set to True).
layer, this method will replace it.
Arguments:
vocab: An array of string tokens.
append: Whether to overwrite or append any existing vocabulary data.
Raises:
ValueError: If there are too many inputs, the inputs do not match, or
input data is missing.
"""
current_table_size = self._table_handler.vocab_size()
total_vocab_size = len(vocab) + (current_table_size if append else 0)
if self.max_tokens is not None and total_vocab_size > self._max_elements:
table_utils.validate_vocabulary_is_unique(vocab)
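# Note (illustrative, assuming mask_token="" and oov_token="[OOV]"): both
# ["", "[OOV]", "earth", "wind"] and ["earth", "wind"] are accepted here.
# In the second case the special tokens are not present in the passed vocab
# and are inserted automatically at the start of the index range below.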
should_have_mask = self.mask_token is not None
if should_have_mask:
has_mask = vocab[0] == self.mask_token
oov_start = 1
else:
has_mask = False
oov_start = 0
should_have_oov = self.num_oov_indices > 0
if should_have_oov:
oov_end = oov_start + self.num_oov_indices
expected_oov = [self.oov_token] * self.num_oov_indices
has_oov = vocab[oov_start:oov_end] == expected_oov
# If we get a numpy array, then has_oov may end up being a numpy array
# instead of a bool. Fix this by collapsing the variable if it's not bool.
if not isinstance(has_oov, bool):
has_oov = any(has_oov)
else:
has_oov = False
if all([should_have_mask, has_mask, should_have_oov]) and not has_oov:
raise ValueError("The passed vocabulary has the correct mask token `%s` "
"at index 0, but does not have the OOV token `%s` in "
"indices [%s:%s]. Instead, we found `%s`. Was this "
"vocabulary generated by a layer with incompatible "
"settings?" %
(self.mask_token, self.oov_token, oov_start, oov_end,
vocab[oov_start:oov_end]))
if all([should_have_oov, has_oov, should_have_mask]) and not has_mask:
raise ValueError(
"The passed vocabulary has the correct OOV token `%s` at "
"indices [%s:%s], but does not have the mask token `%s` in "
"index 0. Instead, we found `%s`. Was this vocabulary "
"generated by a layer with incompatible settings?" %
(self.oov_token, oov_start, oov_end, self.mask_token, vocab[0]))
insert_special_tokens = not has_oov and not has_mask
special_tokens = [] if self.mask_token is None else [self.mask_token]
special_tokens.extend([self.oov_token] * self.num_oov_indices)
num_special_tokens = len(special_tokens)
tokens = vocab if insert_special_tokens else vocab[num_special_tokens:]
if self.mask_token in tokens:
raise ValueError("Reserved mask token %s was found in the passed "
"vocabulary at index %s. Please either remove the "
"reserved token from the vocabulary or change the "
"mask token for this layer." %
(self.mask_token, tokens.index(self.mask_token)))
if self.oov_token in tokens:
raise ValueError("Reserved OOV token %s was found in the passed "
"vocabulary at index %s. Please either remove the "
"reserved token from the vocabulary or change the "
"OOV token for this layer." %
(self.oov_token, tokens.index(self.oov_token)))
if insert_special_tokens:
total_vocab_size = len(vocab) + num_special_tokens
else:
total_vocab_size = len(vocab)
if self.max_tokens is not None and total_vocab_size > self.max_tokens:
raise ValueError(
"Attempted to set a vocabulary larger than the maximum vocab size. "
"Passed vocab size is %s, max vocab size is %s. Note that the OOV "
"token(s) are automatically added to the number of tokens." %
"Passed vocab size is %s, max vocab size is %s." %
(total_vocab_size, self.max_tokens))
start_index = self._reserved_values + (current_table_size if append else 0)
start_index = num_special_tokens
values = np.arange(start_index, len(vocab) + start_index, dtype=np.int64)
vocab = table_utils.convert_to_ndarray(vocab, self.dtype)
table_utils.assert_same_type(self.dtype, vocab, "vocab")
values = table_utils.convert_to_ndarray(values, self._output_dtype)
table_utils.assert_same_type(self._output_dtype, values, "values")
if not append and current_table_size > 0:
self._table_handler.clear()
self._table_handler.clear()
self._table_handler.insert(vocab, values)
if insert_special_tokens and num_special_tokens > 0:
special_token_values = np.arange(num_special_tokens, dtype=np.int64)
self._table_handler.insert(special_tokens, special_token_values)
def _set_state_variables(self, updates):
if not self.built:
raise RuntimeError("_set_state_variables() must be called after build().")
@ -316,18 +323,20 @@ class _IndexLookupCombiner(base_preprocessing_layer.Combiner):
dataset, all tokens are retained.
"""
def __init__(self, vocab_size=None):
def __init__(self, vocab_size=None, mask_value=None):
self._vocab_size = vocab_size
self._mask_value = mask_value
def compute(self, values, accumulator=None):
"""Compute a step in this computation, returning a new accumulator."""
values = base_preprocessing_layer.convert_to_list(values)
values = base_preprocessing_layer.convert_to_list(
values, sparse_default_value=self._mask_value)
if accumulator is None:
accumulator = self._create_accumulator()
# TODO(momernick): Benchmark improvements to this algorithm.
if isinstance(values, (str, bytes)):
if isinstance(values, (str, bytes, np.int64)):
accumulator.count_dict[values] += 1
else:
for document in values:
@ -362,6 +371,8 @@ class _IndexLookupCombiner(base_preprocessing_layer.Combiner):
"vocab": A list of the retained items in the vocabulary.
"""
vocab_counts = accumulator.count_dict
if self._mask_value in vocab_counts:
del vocab_counts[self._mask_value]
sorted_counts = sorted(
vocab_counts.items(), key=operator.itemgetter(1, 0), reverse=True)
vocab_data = (

View File: tensorflow/python/keras/layers/preprocessing/index_lookup_distribution_test.py

@ -65,7 +65,12 @@ class IndexLookupDistributionTest(
with distribution.scope():
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()()
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
layer.adapt(vocab_dataset)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)

View File: tensorflow/python/keras/layers/preprocessing/index_lookup_test.py

@ -21,7 +21,6 @@ from __future__ import print_function
import itertools
import os
import random
import six
import string
from absl.testing import parameterized
@ -31,7 +30,6 @@ from tensorflow.python import keras
from tensorflow.python import tf2
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.distribute import one_device_strategy
from tensorflow.python.eager import context
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import sparse_tensor
@ -44,7 +42,6 @@ from tensorflow.python.keras.layers.preprocessing import preprocessing_test_util
from tensorflow.python.keras.saving import save
from tensorflow.python.keras.utils.generic_utils import CustomObjectScope
from tensorflow.python.ops.ragged import ragged_factory_ops
from tensorflow.python.platform import gfile
from tensorflow.python.platform import test
@ -71,6 +68,10 @@ def _get_end_to_end_test_cases():
["and"], ["earth"], ["michigan"]]),
"kwargs": {
"max_tokens": None,
"num_oov_indices": 1,
"mask_token": "",
"oov_token": "[OOV]",
"dtype": dtypes.string,
},
"expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]],
"input_dtype":
@ -91,6 +92,9 @@ def _get_end_to_end_test_cases():
dtype=np.int64),
"kwargs": {
"max_tokens": None,
"num_oov_indices": 1,
"mask_token": 0,
"oov_token": -1,
"dtype": dtypes.int64,
},
"expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]],
@ -172,7 +176,12 @@ class CategoricalEncodingInputTest(
expected_dense_shape = [3, 4]
input_data = keras.Input(shape=(None,), dtype=dtypes.string, sparse=True)
layer = get_layer_class()(max_tokens=None)
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -193,7 +202,12 @@ class CategoricalEncodingInputTest(
expected_dense_shape = [3, 4]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
layer = get_layer_class()(max_tokens=None, dtype=dtypes.int64)
layer = get_layer_class()(
max_tokens=None,
dtype=dtypes.int64,
num_oov_indices=1,
mask_token=0,
oov_token=-1)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -209,7 +223,12 @@ class CategoricalEncodingInputTest(
expected_output = [[2, 3, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.string, ragged=True)
layer = get_layer_class()(max_tokens=None)
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -223,7 +242,12 @@ class CategoricalEncodingInputTest(
expected_output = [[2, 3, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True)
layer = get_layer_class()(max_tokens=None, dtype=dtypes.int64)
layer = get_layer_class()(
max_tokens=None,
dtype=dtypes.int64,
num_oov_indices=1,
mask_token=0,
oov_token=-1)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -248,7 +272,12 @@ class CategoricalEncodingMultiOOVTest(
expected_dense_shape = [3, 4]
input_data = keras.Input(shape=(None,), dtype=dtypes.string, sparse=True)
layer = get_layer_class()(max_tokens=None, num_oov_tokens=2)
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=2,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -270,7 +299,11 @@ class CategoricalEncodingMultiOOVTest(
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
layer = get_layer_class()(
max_tokens=None, dtype=dtypes.int64, num_oov_tokens=2)
max_tokens=None,
dtype=dtypes.int64,
num_oov_indices=2,
mask_token=0,
oov_token=-1)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -286,7 +319,12 @@ class CategoricalEncodingMultiOOVTest(
expected_output = [[3, 4, 6], [6, 5, 3, 2]]
input_data = keras.Input(shape=(None,), dtype=dtypes.string, ragged=True)
layer = get_layer_class()(max_tokens=None, num_oov_tokens=2)
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=2,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -301,7 +339,11 @@ class CategoricalEncodingMultiOOVTest(
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True)
layer = get_layer_class()(
max_tokens=None, dtype=dtypes.int64, num_oov_tokens=2)
max_tokens=None,
dtype=dtypes.int64,
num_oov_indices=2,
mask_token=0,
oov_token=-1)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -321,13 +363,14 @@ class CategoricalEncodingAdaptTest(
dense_shape=[3, 4])
vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data)
layer = get_layer_class()(max_tokens=None)
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
layer.adapt(vocab_dataset)
# Note that the expected vocabulary has a null string (''). This is because
# we assume that sparse tensors are in fact dense tensors with elided
# values, not ragged tensors. Therefore, we assume that any missing data
# is important and give it a spot in our vocab.
expected_vocabulary = ["", "michigan", "fire"]
expected_vocabulary = ["", "[OOV]", "michigan", "fire"]
self.assertAllEqual(expected_vocabulary, layer.get_vocabulary())
def test_ragged_adapt(self):
@ -335,9 +378,14 @@ class CategoricalEncodingAdaptTest(
["fire", "michigan"]])
vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data)
layer = get_layer_class()(max_tokens=None)
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
layer.adapt(vocab_dataset)
expected_vocabulary = ["michigan", "fire"]
expected_vocabulary = ["", "[OOV]", "michigan", "fire"]
self.assertAllEqual(expected_vocabulary, layer.get_vocabulary())
def test_sparse_int_input(self):
@ -352,7 +400,12 @@ class CategoricalEncodingAdaptTest(
expected_dense_shape = [3, 4]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
layer = get_layer_class()(max_tokens=None, dtype=dtypes.int64)
layer = get_layer_class()(
max_tokens=None,
dtype=dtypes.int64,
num_oov_indices=1,
mask_token=0,
oov_token=-1)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -368,7 +421,12 @@ class CategoricalEncodingAdaptTest(
expected_output = [[2, 3, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.string, ragged=True)
layer = get_layer_class()(max_tokens=None)
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -382,7 +440,12 @@ class CategoricalEncodingAdaptTest(
expected_output = [[2, 3, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True)
layer = get_layer_class()(max_tokens=None, dtype=dtypes.int64)
layer = get_layer_class()(
max_tokens=None,
dtype=dtypes.int64,
num_oov_indices=1,
mask_token=0,
oov_token=-1)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -400,34 +463,15 @@ class CategoricalEncodingAdaptTest(
batched_ds = ds.take(2)
input_t = keras.Input(shape=(), dtype=dtypes.string)
layer = get_layer_class()(
max_tokens=10, num_oov_tokens=0, reserve_zero=False)
max_tokens=10,
num_oov_indices=0,
mask_token=None,
oov_token=None,
dtype=dtypes.string)
_ = layer(input_t)
layer.adapt(batched_ds)
@keras_parameterized.run_all_keras_modes
class IndexLookupDistributionTest(
keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest):
def test_cpu_distribution(self):
vocab_data = ["earth", "wind", "and", "fire"]
input_array = np.array([["earth", "wind", "and", "fire"],
["fire", "and", "earth", "michigan"]])
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
strategy = one_device_strategy.OneDeviceStrategy("/cpu:0")
with strategy.scope():
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()()
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
@keras_parameterized.run_all_keras_modes
class IndexLookupOutputTest(keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest):
@ -439,7 +483,12 @@ class IndexLookupOutputTest(keras_parameterized.TestCase,
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()()
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -448,7 +497,12 @@ class IndexLookupOutputTest(keras_parameterized.TestCase,
def test_output_shape(self):
input_data = keras.Input(shape=(4,), dtype=dtypes.string)
layer = get_layer_class()()
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
int_data = layer(input_data)
self.assertAllEqual(int_data.shape[1:], input_data.shape[1:])
@ -459,7 +513,12 @@ class IndexLookupOutputTest(keras_parameterized.TestCase,
expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]]
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(reserve_zero=False)
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token=None,
oov_token="[OOV]",
dtype=dtypes.string)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -473,7 +532,13 @@ class IndexLookupOutputTest(keras_parameterized.TestCase,
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(vocabulary=vocab_data)
layer = get_layer_class()(
vocabulary=vocab_data,
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
@ -485,15 +550,6 @@ class IndexLookupVocabularyTest(keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest
):
def _write_to_temp_file(self, file_name, vocab_list):
vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt")
with gfile.GFile(vocab_path, "w") as writer:
for vocab in vocab_list:
writer.write(vocab + "\n")
writer.flush()
writer.close()
return vocab_path
def test_int_output_explicit_vocab(self):
vocab_data = ["earth", "wind", "and", "fire"]
input_array = np.array([["earth", "wind", "and", "fire"],
@ -501,107 +557,195 @@ class IndexLookupVocabularyTest(keras_parameterized.TestCase,
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(vocabulary=vocab_data)
layer = get_layer_class()(
vocabulary=vocab_data,
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
def test_get_vocab_returns_str(self):
vocab_data = ["earth", "wind", "and", "fire"]
layer = get_layer_class()(vocabulary=vocab_data)
layer_vocab = layer.get_vocabulary()
self.assertAllEqual(vocab_data, layer_vocab)
self.assertIsInstance(layer_vocab[0], six.text_type)
def test_vocab_with_max_cap(self):
vocab_data = ["", "[OOV]", "wind", "and", "fire"]
layer = get_layer_class()(
max_tokens=5,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
layer.set_vocabulary(vocab_data)
returned_vocab = layer.get_vocabulary()
self.assertAllEqual(vocab_data, returned_vocab)
def test_int_output_explicit_vocab_from_file(self):
vocab_list = ["earth", "wind", "and", "fire"]
vocab_path = self._write_to_temp_file("vocab_file", vocab_list)
input_array = np.array([["earth", "wind", "and", "fire"],
["fire", "and", "earth", "michigan"]])
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(vocabulary=vocab_path)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
def test_vocab_appending(self):
vocab_data = [["earth", "wind"], ["and", "fire"]]
input_array = np.array([["earth", "wind", "and", "fire"],
["fire", "and", "earth", "michigan"]])
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(max_tokens=5)
layer.set_vocabulary(vocab_data[0])
layer.set_vocabulary(vocab_data[1], append=True)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllClose(expected_output, output_dataset)
def test_int_vocab_with_max_cap(self):
vocab_data = [0, -1, 42, 1276, 1138]
layer = get_layer_class()(
max_tokens=5,
num_oov_indices=1,
mask_token=0,
oov_token=-1,
dtype=dtypes.int64)
layer.set_vocabulary(vocab_data)
returned_vocab = layer.get_vocabulary()
self.assertAllEqual(vocab_data, returned_vocab)
def test_non_unique_vocab_fails(self):
vocab_data = ["earth", "wind", "and", "fire", "fire"]
with self.assertRaisesRegex(ValueError, ".*repeated term.*fire.*"):
_ = get_layer_class()(vocabulary=vocab_data)
_ = get_layer_class()(
vocabulary=vocab_data,
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
def test_non_unique_vocab_from_file_fails(self):
vocab_list = ["earth", "wind", "and", "fire", "earth"]
vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list)
def test_vocab_with_oov_and_wrong_mask_fails(self):
vocab_data = ["custom_mask", "[OOV]", "earth", "wind", "and", "fire"]
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
with self.assertRaisesRegex(ValueError, ".*does not have the mask token.*"):
layer.set_vocabulary(vocab_data)
def test_vocab_with_oov_and_no_mask_fails(self):
vocab_data = ["[OOV]", "earth", "wind", "and", "fire"]
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
with self.assertRaisesRegex(ValueError, ".*Reserved OOV.*"):
layer.set_vocabulary(vocab_data)
def test_vocab_with_mask_but_no_oov_fails(self):
vocab_data = ["", "earth", "wind", "and", "fire"]
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
with self.assertRaisesRegex(ValueError, ".*does not have the OOV token.*"):
layer.set_vocabulary(vocab_data)
def test_vocab_with_repeated_element_fails(self):
vocab_data = ["earth", "earth", "wind", "and", "fire"]
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
with self.assertRaisesRegex(ValueError, ".*repeated term.*earth.*"):
_ = get_layer_class()(vocabulary=vocab_path)
layer.set_vocabulary(vocab_data)
def test_vocab_with_reserved_oov_element_fails(self):
vocab_data = ["earth", "test", "[OOV]", "wind", "and", "fire"]
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
with self.assertRaisesRegex(ValueError, ".*Reserved OOV.*"):
layer.set_vocabulary(vocab_data)
@keras_parameterized.run_all_keras_modes
class InverseLookupOutputTest(keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest):
def test_vocab_with_reserved_mask_element_fails(self):
vocab_data = ["earth", "mask_token", "wind", "and", "fire"]
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token="mask_token",
oov_token="[OOV]",
dtype=dtypes.string)
with self.assertRaisesRegex(ValueError, ".*Reserved mask.*"):
layer.set_vocabulary(vocab_data)
def DISABLE_test_inverse_output(self):
vocab_data = ["earth", "wind", "and", "fire"]
input_array = np.array([["earth", "wind", "and", "fire"],
["fire", "and", "earth", "michigan"]])
expected_ints = [[2, 3, 4, 5], [5, 4, 2, 1]]
# Note that the token 'michigan' has been replaced by ''. This is because
# 'michigan' is OOV for this layer.
expected_strings = np.array([["earth", "wind", "and", "fire"],
["fire", "and", "earth", ""]])
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(max_tokens=None)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
string_data = layer(int_data, invert=True)
model = keras.Model(inputs=input_data, outputs=[int_data, string_data])
int_outputs, string_outputs = model.predict(input_array)
self.assertAllEqual(expected_ints, int_outputs)
self.assertAllEqual(expected_strings, string_outputs)
def test_non_unique_int_vocab_fails(self):
vocab_data = [12, 13, 14, 15, 15]
with self.assertRaisesRegex(ValueError, ".*repeated term.*15.*"):
_ = get_layer_class()(
vocabulary=vocab_data,
max_tokens=None,
num_oov_indices=1,
mask_token=0,
oov_token=-1,
dtype=dtypes.int64)
def DISABLE_test_inverse_output_serialization(self):
vocab_data = ["earth", "wind", "and", "fire"]
input_array = np.array([["earth", "wind", "and", "fire"],
["fire", "and", "earth", "michigan"]])
expected_ints = [[2, 3, 4, 5], [5, 4, 2, 1]]
# Note that the token 'michigan' has been replaced by ''. This is because
# 'michigan' is OOV for this layer.
expected_strings = np.array([["earth", "wind", "and", "fire"],
["fire", "and", "earth", ""]])
def test_int_vocab_with_oov_and_wrong_mask_fails(self):
vocab_data = [1234, -1, 11, 21, 13, 14]
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token=0,
oov_token=-1,
dtype=dtypes.int64)
with self.assertRaisesRegex(ValueError, ".*does not have the mask token.*"):
layer.set_vocabulary(vocab_data)
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(max_tokens=None)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
string_data = layer(int_data, invert=True)
model = keras.Model(inputs=input_data, outputs=[int_data, string_data])
def test_int_vocab_with_oov_and_no_mask_fails(self):
vocab_data = [-1, 11, 12, 13, 14]
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token=0,
oov_token=-1,
dtype=dtypes.int64)
with self.assertRaisesRegex(ValueError, ".*Reserved OOV.*"):
layer.set_vocabulary(vocab_data)
with CustomObjectScope({"IndexLookup": get_layer_class()}):
new_model = keras.Model.from_config(model.get_config())
new_model.set_weights(model.get_weights())
int_outputs, string_outputs = new_model.predict(input_array)
self.assertAllEqual(expected_ints, int_outputs)
self.assertAllEqual(expected_strings, string_outputs)
def test_int_vocab_with_mask_but_no_oov_fails(self):
vocab_data = [0, 11, 12, 13, 14]
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token=0,
oov_token=-1,
dtype=dtypes.int64)
with self.assertRaisesRegex(ValueError, ".*does not have the OOV token.*"):
layer.set_vocabulary(vocab_data)
def test_int_vocab_with_repeated_element_fails(self):
vocab_data = [11, 11, 34, 23, 124]
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token=0,
oov_token=-1,
dtype=dtypes.int64)
with self.assertRaisesRegex(ValueError, ".*repeated term.*11.*"):
layer.set_vocabulary(vocab_data)
def test_int_vocab_with_reserved_oov_element_fails(self):
vocab_data = [14, 38, -1, 34, 3, 84]
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token=0,
oov_token=-1,
dtype=dtypes.int64)
with self.assertRaisesRegex(ValueError, ".*Reserved OOV.*"):
layer.set_vocabulary(vocab_data)
def test_int_vocab_with_reserved_mask_element_fails(self):
vocab_data = [125, 0, 3, 4, 94]
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token=0,
oov_token=-1,
dtype=dtypes.int64)
with self.assertRaisesRegex(ValueError, ".*Reserved mask.*"):
layer.set_vocabulary(vocab_data)
@keras_parameterized.run_all_keras_modes(always_skip_eager=True)
@ -612,7 +756,12 @@ class IndexLookupSaveableTest(keras_parameterized.TestCase,
vocab_data = ["earth", "wind", "and", "fire"]
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(max_tokens=10)
layer = get_layer_class()(
max_tokens=10,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -626,7 +775,12 @@ class IndexLookupSaveableTest(keras_parameterized.TestCase,
vocab_data = ["earth", "wind", "and", "fire"]
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(max_tokens=10)
layer = get_layer_class()(
max_tokens=10,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -643,25 +797,24 @@ class IndexLookupErrorTest(keras_parameterized.TestCase,
def test_too_long_vocab_fails_in_single_setting(self):
vocab_data = ["earth", "wind", "and", "fire"]
layer = get_layer_class()(max_tokens=4)
layer = get_layer_class()(
max_tokens=4,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
with self.assertRaisesRegex(ValueError,
"vocabulary larger than the maximum vocab.*"):
layer.set_vocabulary(vocab_data)
def test_too_long_vocab_fails_in_multiple_settings(self):
vocab_data = [["earth", "wind"], ["and", "fire"]]
layer = get_layer_class()(max_tokens=4)
# The first time we call set_vocabulary, we're under the max_tokens
# so it should be fine.
layer.set_vocabulary(vocab_data[0])
with self.assertRaisesRegex(ValueError,
"vocabulary larger than the maximum vocab.*"):
layer.set_vocabulary(vocab_data[1], append=True)
def test_zero_max_tokens_fails(self):
with self.assertRaisesRegex(ValueError, ".*max_tokens.*"):
_ = get_layer_class()(max_tokens=0)
_ = get_layer_class()(
max_tokens=0,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
@keras_parameterized.run_all_keras_modes
@ -676,7 +829,12 @@ class IndexLookupSavingTest(keras_parameterized.TestCase,
# Build and validate a golden model.
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(max_tokens=None)
layer = get_layer_class()(
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
dtype=dtypes.string)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
@ -705,8 +863,9 @@ class IndexLookupSavingTest(keras_parameterized.TestCase,
@keras_parameterized.run_all_keras_modes
class IndexLookupCombinerTest(keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest):
class IndexLookupStringCombinerTest(
keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest):
def compare_text_accumulators(self, a, b, msg=None):
if a is None or b is None:
@ -834,5 +993,123 @@ class IndexLookupCombinerTest(keras_parameterized.TestCase,
self.validate_accumulator_extract(combiner, data, expected_extract_output)
@keras_parameterized.run_all_keras_modes
class IndexLookupIntCombinerTest(keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest
):
def compare_text_accumulators(self, a, b, msg=None):
if a is None or b is None:
self.assertAllEqual(a, b, msg=msg)
self.assertAllEqual(a.count_dict, b.count_dict, msg=msg)
compare_accumulators = compare_text_accumulators
def update_accumulator(self, accumulator, data):
accumulator.count_dict.update(dict(zip(data["vocab"], data["counts"])))
return accumulator
def test_combiner_api_compatibility_int_mode(self):
data = np.array([[42, 1138, 725, 1729], [42, 1138, 725, 203]])
combiner = index_lookup._IndexLookupCombiner()
expected_accumulator_output = {
"vocab": np.array([1138, 725, 42, 1729, 203]),
"counts": np.array([2, 2, 2, 1, 1]),
}
expected_extract_output = {
"vocab": np.array([1138, 725, 42, 1729, 203]),
}
expected_accumulator = combiner._create_accumulator()
expected_accumulator = self.update_accumulator(expected_accumulator,
expected_accumulator_output)
self.validate_accumulator_serialize_and_deserialize(combiner, data,
expected_accumulator)
self.validate_accumulator_uniqueness(combiner, data)
self.validate_accumulator_extract(combiner, data, expected_extract_output)
# TODO(askerryryan): Add tests confirming equivalence to behavior of
# existing tf.keras.preprocessing.text.Tokenizer.
@parameterized.named_parameters(
{
"testcase_name": "top_k_smaller_than_full_vocab",
"data": np.array([[42, 1138], [1729, 1138], [725], [1729, 1138]]),
"vocab_size": 3,
"expected_accumulator_output": {
"vocab": np.array([1138, 1729, 725, 42]),
"counts": np.array([3, 2, 1, 1]),
},
"expected_extract_output": {
"vocab": np.array([1138, 1729, 725]),
},
},
{
"testcase_name": "top_k_larger_than_full_vocab",
"data": np.array([[42, 1138], [1729, 1138], [725], [1729, 1138]]),
"vocab_size": 10,
"expected_accumulator_output": {
"vocab": np.array([1138, 1729, 725, 42]),
"counts": np.array([3, 2, 1, 1]),
},
"expected_extract_output": {
"vocab": np.array([1138, 1729, 725, 42]),
},
},
{
"testcase_name": "no_top_k",
"data": np.array([[42, 1138], [1729, 1138], [725], [1729, 1138]]),
"vocab_size": None,
"expected_accumulator_output": {
"vocab": np.array([1138, 1729, 725, 42]),
"counts": np.array([3, 2, 1, 1]),
},
"expected_extract_output": {
"vocab": np.array([1138, 1729, 725, 42]),
},
},
{
"testcase_name": "single_element_per_row",
"data": np.array([[42], [1138], [1729], [1138], [725]]),
"vocab_size": 3,
"expected_accumulator_output": {
"vocab": np.array([1138, 1729, 725, 42]),
"counts": np.array([2, 1, 1, 1]),
},
"expected_extract_output": {
"vocab": np.array([1138, 1729, 725]),
},
},
# Which tokens are retained is based on global frequency, and thus is
# sensitive to frequency within a document. In contrast, because idf only
# considers the presence of a token in a document, it is insensitive
# to the frequency of the token within the document.
{
"testcase_name":
"retained_tokens_sensitive_to_within_document_frequency",
"data":
np.array([[42, 42], [1138, 1138], [1729, 1729], [1138, 1138],
[725, 203]]),
"vocab_size":
3,
"expected_accumulator_output": {
"vocab": np.array([1138, 42, 1729, 725, 203]),
"counts": np.array([4, 2, 2, 1, 1]),
},
"expected_extract_output": {
"vocab": np.array([1138, 1729, 42]),
},
})
def test_combiner_computation(self, data, vocab_size,
expected_accumulator_output,
expected_extract_output):
combiner = index_lookup._IndexLookupCombiner(vocab_size=vocab_size)
expected_accumulator = combiner._create_accumulator()
expected_accumulator = self.update_accumulator(expected_accumulator,
expected_accumulator_output)
self.validate_accumulator_computation(combiner, data, expected_accumulator)
self.validate_accumulator_extract(combiner, data, expected_extract_output)
if __name__ == "__main__":
test.main()

View File: tensorflow/python/keras/layers/preprocessing/integer_lookup.py

@ -0,0 +1,112 @@
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras string lookup preprocessing layer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.python.framework import dtypes
from tensorflow.python.keras.layers.preprocessing import index_lookup
from tensorflow.python.keras.layers.preprocessing import table_utils
class IntegerLookup(index_lookup.IndexLookup):
"""Maps integers from a vocabulary to integer indices.
This layer translates a set of arbitrary integers into an integer output via a
table-based lookup, with optional out-of-vocabulary handling.
If desired, the user can call this layer's `adapt()` method on a data set,
which will analyze the data set, determine the frequency of individual integer
values, and create a vocabulary from them. This vocabulary can have
unlimited size or be capped, depending on the configuration options for this
layer; if there are more unique values in the input than the maximum
vocabulary size, the most frequent terms will be used to create the
vocabulary.
Attributes:
max_values: The maximum size of the vocabulary for this layer. If None,
there is no cap on the size of the vocabulary. Note that this vocabulary
includes the OOV and mask values, so the effective number of values is
(max_values - num_oov_indices - (1 if mask_value else 0))
num_oov_indices: The number of out-of-vocabulary values to use; defaults to
1. If this value is more than 1, OOV inputs are hashed to determine their
OOV value; if this value is 0, passing an OOV input will result in a '-1'
being returned for that value in the output tensor. (Note that, because
the value is -1 and not 0, this will allow you to effectively drop OOV
values from categorical encodings.)
mask_value: A value that represents masked inputs, and which is mapped to
index 0. Defaults to 0. If set to None, no mask value will be added and the
OOV values, if any, will be indexed from (0...num_oov_indices) instead of
(1...num_oov_indices+1).
oov_value: The value representing an out-of-vocabulary value. Defaults to
-1.
vocabulary: An optional list of values, or a path to a text file containing
a vocabulary to load into this layer. The file should contain one value
per line. If the list or file contains the same token multiple times, an
error will be thrown.
"""
def __init__(self,
max_values=None,
num_oov_indices=1,
mask_value=0,
oov_value=-1,
vocabulary=None,
**kwargs):
allowed_dtypes = [dtypes.int64]
if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes:
raise ValueError("IntegerLookup may only have a dtype in %s." %
allowed_dtypes)
if "dtype" not in kwargs:
kwargs["dtype"] = dtypes.int64
# If max_values is set, the value must be greater than 1 - otherwise we
# are creating a 0-element vocab, which doesn't make sense.
if max_values is not None and max_values <= 1:
raise ValueError("If set, max_values must be greater than 1.")
if num_oov_indices < 0:
raise ValueError("num_oov_indices must be greater than 0. You passed %s" %
num_oov_indices)
if vocabulary is not None:
if isinstance(vocabulary, str):
vocabulary = table_utils.get_vocabulary_from_file(vocabulary)
vocabulary = [int(v) for v in vocabulary]
super(IntegerLookup, self).__init__(
max_tokens=max_values,
num_oov_indices=num_oov_indices,
mask_token=mask_value,
oov_token=oov_value,
vocabulary=vocabulary,
**kwargs)
def get_config(self):
base_config = super(IntegerLookup, self).get_config()
# Because the super config has a bunch of args we're also passing,
# we need to rename and remove them from the config dict.
base_config["max_values"] = base_config["max_tokens"]
del base_config["max_tokens"]
base_config["mask_value"] = base_config["mask_token"]
del base_config["mask_token"]
base_config["oov_value"] = base_config["oov_token"]
del base_config["oov_token"]
return base_config
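# Illustrative sketch of the resulting config (hypothetical max_values value):
#
#   layer = IntegerLookup(max_values=1000)
#   config = layer.get_config()
#   # config carries "max_values", "mask_value" and "oov_value" rather than
#   # the parent IndexLookup keys "max_tokens", "mask_token" and "oov_token".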

View File: tensorflow/python/keras/layers/preprocessing/integer_lookup_test.py

@ -0,0 +1,501 @@
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for Keras text vectorization preprocessing layer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import itertools
import os
import random
from absl.testing import parameterized
import numpy as np
from tensorflow.python import keras
from tensorflow.python import tf2
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.eager import context
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import sparse_tensor
from tensorflow.python.framework import tensor_shape
from tensorflow.python.keras import keras_parameterized
from tensorflow.python.keras import testing_utils
from tensorflow.python.keras.layers.preprocessing import integer_lookup
from tensorflow.python.keras.layers.preprocessing import integer_lookup_v1
from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils
from tensorflow.python.keras.saving import save
from tensorflow.python.keras.utils.generic_utils import CustomObjectScope
from tensorflow.python.ops.ragged import ragged_factory_ops
from tensorflow.python.platform import gfile
from tensorflow.python.platform import test
def get_layer_class():
if context.executing_eagerly():
return integer_lookup.IntegerLookup
else:
return integer_lookup_v1.IntegerLookup
def _get_end_to_end_test_cases():
test_cases = (
{
"testcase_name":
"test_ints_soft_vocab_cap",
# Create an array where 1138 is the most frequent term, followed by
# 1729, then 725, then 42. This ensures that the vocab accumulator
# is sorting by frequency.
"vocab_data":
np.array([[42], [1138], [1138], [1138], [1138], [1729], [1729],
[1729], [725], [725]],
dtype=np.int64),
"input_data":
np.array([[1138], [1729], [725], [42], [42], [725], [1138], [4]],
dtype=np.int64),
"kwargs": {
"max_values": None,
"dtype": dtypes.int64,
},
"expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]],
"input_dtype":
dtypes.int64
},)
crossed_test_cases = []
# Cross above test cases with use_dataset in (True, False)
for use_dataset in (True, False):
for case in test_cases:
case = case.copy()
if use_dataset:
case["testcase_name"] = case["testcase_name"] + "_with_dataset"
case["use_dataset"] = use_dataset
crossed_test_cases.append(case)
return crossed_test_cases
@keras_parameterized.run_all_keras_modes
class IntegerLookupLayerTest(keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest):
@parameterized.named_parameters(*_get_end_to_end_test_cases())
def test_layer_end_to_end_with_adapt(self, vocab_data, input_data, kwargs,
use_dataset, expected_output,
input_dtype):
cls = get_layer_class()
expected_output_dtype = dtypes.int64
input_shape = input_data.shape
if use_dataset:
# Keras APIs expect batched datasets.
# TODO(rachelim): `model.predict` predicts the result on each
# dataset batch separately, then tries to concatenate the results
# together. When the results have different shapes on the non-concat
# axis (which can happen in the output_mode = INT case for
# IntegerLookup), the concatenation fails. In real use cases, this may
# not be an issue because users are likely to pipe the preprocessing layer
# into other keras layers instead of predicting it directly. A workaround
# for these unit tests is to have the dataset only contain one batch, so
# no concatenation needs to happen with the result. For consistency with
# numpy input, we should make `predict` join differently shaped results
# together sensibly, with 0 padding.
input_data = dataset_ops.Dataset.from_tensor_slices(input_data).batch(
input_shape[0])
vocab_data = dataset_ops.Dataset.from_tensor_slices(vocab_data).batch(
input_shape[0])
with CustomObjectScope({"IntegerLookup": cls}):
output_data = testing_utils.layer_test(
cls,
kwargs=kwargs,
input_shape=input_shape,
input_data=input_data,
input_dtype=input_dtype,
expected_output_dtype=expected_output_dtype,
validate_training=False,
adapt_data=vocab_data)
self.assertAllClose(expected_output, output_data)
@keras_parameterized.run_all_keras_modes
class CategoricalEncodingInputTest(
keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest):
def test_sparse_int_input(self):
vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
input_array = sparse_tensor.SparseTensor(
indices=[[0, 0], [1, 2]],
values=np.array([13, 32], dtype=np.int64),
dense_shape=[3, 4])
expected_indices = [[0, 0], [1, 2]]
expected_values = [5, 1]
expected_dense_shape = [3, 4]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
layer = get_layer_class()(max_values=None)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_data = model.predict(input_array, steps=1)
self.assertAllEqual(expected_indices, output_data.indices)
self.assertAllEqual(expected_values, output_data.values)
self.assertAllEqual(expected_dense_shape, output_data.dense_shape)
def test_ragged_int_input(self):
vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 42]],
dtype=np.int64)
expected_output = [[2, 3, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True)
layer = get_layer_class()(max_values=None)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
@keras_parameterized.run_all_keras_modes
class CategoricalEncodingMultiOOVTest(
keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest):
def test_sparse_int_input_multi_bucket(self):
vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
input_array = sparse_tensor.SparseTensor(
indices=[[0, 0], [1, 2]],
values=np.array([13, 133], dtype=np.int64),
dense_shape=[3, 4])
expected_indices = [[0, 0], [1, 2]]
expected_values = [6, 2]
expected_dense_shape = [3, 4]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
layer = get_layer_class()(
max_values=None,
dtype=dtypes.int64,
num_oov_indices=2,
mask_value=0,
oov_value=-1)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_data = model.predict(input_array, steps=1)
self.assertAllEqual(expected_indices, output_data.indices)
self.assertAllEqual(expected_values, output_data.values)
self.assertAllEqual(expected_dense_shape, output_data.dense_shape)
def test_ragged_int_input_multi_bucket(self):
vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 133]],
dtype=np.int64)
expected_output = [[3, 4, 6], [6, 5, 3, 2]]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True)
layer = get_layer_class()(max_values=None, num_oov_indices=2)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
@keras_parameterized.run_all_keras_modes
class CategoricalEncodingAdaptTest(
keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest):
def test_sparse_adapt(self):
vocab_data = sparse_tensor.SparseTensor(
indices=[[0, 0], [0, 1], [1, 2]],
values=[203, 1729, 203],
dense_shape=[3, 4])
vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data)
layer = get_layer_class()()
layer.adapt(vocab_dataset)
expected_vocabulary = [0, -1, 203, 1729]
self.assertAllEqual(expected_vocabulary, layer.get_vocabulary())
def test_ragged_adapt(self):
vocab_data = ragged_factory_ops.constant([[203], [1729, 203]])
vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data)
layer = get_layer_class()()
layer.adapt(vocab_dataset)
expected_vocabulary = [0, -1, 203, 1729]
self.assertAllEqual(expected_vocabulary, layer.get_vocabulary())
def test_sparse_int_input(self):
vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
input_array = sparse_tensor.SparseTensor(
indices=[[0, 0], [1, 2]],
values=np.array([13, 32], dtype=np.int64),
dense_shape=[3, 4])
expected_indices = [[0, 0], [1, 2]]
expected_values = [5, 1]
expected_dense_shape = [3, 4]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
layer = get_layer_class()(max_values=None)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_data = model.predict(input_array, steps=1)
self.assertAllEqual(expected_indices, output_data.indices)
self.assertAllEqual(expected_values, output_data.values)
self.assertAllEqual(expected_dense_shape, output_data.dense_shape)
def test_ragged_int_input(self):
vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 42]],
dtype=np.int64)
expected_output = [[2, 3, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True)
layer = get_layer_class()(max_values=None)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
def test_single_int_generator_dataset(self):
def word_gen():
for _ in itertools.count(1):
yield random.randint(0, 100)
ds = dataset_ops.Dataset.from_generator(word_gen, dtypes.int64,
tensor_shape.TensorShape([]))
batched_ds = ds.take(2)
input_t = keras.Input(shape=(), dtype=dtypes.int64)
layer = get_layer_class()(
max_values=10, num_oov_indices=0, mask_value=None, oov_value=None)
_ = layer(input_t)
layer.adapt(batched_ds)
@keras_parameterized.run_all_keras_modes
class IntegerLookupOutputTest(keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest):
def test_int_output(self):
vocab_data = [42, 1138, 725, 1729]
input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
layer = get_layer_class()()
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
def test_output_shape(self):
input_data = keras.Input(shape=(4,), dtype=dtypes.int64)
layer = get_layer_class()(max_values=None, num_oov_indices=1)
int_data = layer(input_data)
self.assertAllEqual(int_data.shape[1:], input_data.shape[1:])
def test_int_output_no_reserved_zero(self):
vocab_data = [42, 1138, 725, 1729]
input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
layer = get_layer_class()(max_values=None, mask_value=None)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
def test_int_output_explicit_vocab(self):
vocab_data = [42, 1138, 725, 1729]
input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
layer = get_layer_class()(
vocabulary=vocab_data,
max_values=None,
)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
@keras_parameterized.run_all_keras_modes
class IntegerLookupVocabularyTest(
keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest):
def _write_to_temp_file(self, file_name, vocab_list):
vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt")
with gfile.GFile(vocab_path, "w") as writer:
for vocab in vocab_list:
writer.write(str(vocab) + "\n")
writer.flush()
writer.close()
return vocab_path
def test_int_output_explicit_vocab(self):
vocab_data = [42, 1138, 725, 1729]
input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
layer = get_layer_class()(vocabulary=vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
def test_get_vocab_returns_int(self):
vocab_data = [42, 1138, 725, 1729]
expected_vocab = [0, -1, 42, 1138, 725, 1729]
layer = get_layer_class()(vocabulary=vocab_data)
layer_vocab = layer.get_vocabulary()
self.assertAllEqual(expected_vocab, layer_vocab)
self.assertIsInstance(layer_vocab[0], np.int64)
def test_int_output_explicit_vocab_from_file(self):
vocab_list = [42, 1138, 725, 1729]
vocab_path = self._write_to_temp_file("vocab_file", vocab_list)
input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
layer = get_layer_class()(vocabulary=vocab_path)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
def test_non_unique_vocab_fails(self):
vocab_data = [42, 1138, 725, 1729, 1729]
with self.assertRaisesRegex(ValueError, ".*repeated term.*1729.*"):
_ = get_layer_class()(vocabulary=vocab_data)
def test_non_unique_vocab_from_file_fails(self):
vocab_list = [42, 1138, 725, 1729, 42]
vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list)
with self.assertRaisesRegex(ValueError, ".*repeated term.*42.*"):
_ = get_layer_class()(vocabulary=vocab_path)
@keras_parameterized.run_all_keras_modes(always_skip_eager=True)
class IntegerLookupSaveableTest(keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest
):
def test_ops_are_not_added_with_multiple_get_set_weights(self):
vocab_data = [42, 1138, 725, 1729]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
layer = get_layer_class()(max_values=10)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
weights = model.get_weights()
model.set_weights(weights)
keras.backend.get_session().graph.finalize()
weights = model.get_weights()
model.set_weights(weights)
def test_layer_saving_with_h5(self):
vocab_data = [42, 1138, 725, 1729]
input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
layer = get_layer_class()(max_values=10)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
path = os.path.join(self.get_temp_dir(), "model")
with self.assertRaisesRegex(NotImplementedError,
"Save or restore weights that is not.*"):
save.save_model(model, path, save_format="h5")
@keras_parameterized.run_all_keras_modes
class IntegerLookupErrorTest(keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest):
def test_too_long_vocab_fails_in_single_setting(self):
vocab_data = [42, 1138, 725, 1729]
layer = get_layer_class()(max_values=4, num_oov_indices=1)
with self.assertRaisesRegex(ValueError,
"vocabulary larger than the maximum vocab.*"):
layer.set_vocabulary(vocab_data)
def test_zero_max_values_fails(self):
with self.assertRaisesRegex(ValueError, ".*max_values.*"):
_ = get_layer_class()(max_values=0, num_oov_indices=1)
@keras_parameterized.run_all_keras_modes
class IntegerLookupSavingTest(keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest):
def test_vocabulary_persistence_across_saving(self):
vocab_data = [42, 1138, 725, 1729]
input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
# Build and validate a golden model.
input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
layer = get_layer_class()(max_values=None, num_oov_indices=1)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllEqual(output_dataset, expected_output)
# Save the model to disk.
output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model")
model.save(output_path, save_format="tf")
# Delete the session and graph to ensure that the loaded model is generated
# from scratch.
# TODO(b/149526183): Can't clear session when TF2 is disabled.
if tf2.enabled():
keras.backend.clear_session()
loaded_model = keras.models.load_model(
output_path, custom_objects={"IntegerLookup": get_layer_class()})
# Ensure that the loaded model is unique (so that the save/load is real)
self.assertIsNot(model, loaded_model)
# Validate correctness of the new model.
new_output_dataset = loaded_model.predict(input_array)
self.assertAllEqual(new_output_dataset, expected_output)
if __name__ == "__main__":
test.main()
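The expected outputs in these tests pin down the default index layout of IntegerLookup: index 0 is the mask value (0), index 1 is the out-of-vocabulary value (-1), and vocabulary terms are assigned indices starting at 2. A minimal usage sketch (illustrative only, not part of this commit, assuming an eager TF2 environment):

import numpy as np
from tensorflow.python.keras.layers.preprocessing import integer_lookup

# Vocabulary terms get indices 2..5; 0 and -1 are reserved for the mask and
# OOV values (see test_get_vocab_returns_int above).
layer = integer_lookup.IntegerLookup(vocabulary=[42, 1138, 725, 1729])
print(layer.get_vocabulary())        # [0, -1, 42, 1138, 725, 1729]
print(layer(np.array([[42, 203]])))  # [[2, 1]]: 203 is OOV and maps to 1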

View File

@ -0,0 +1,25 @@
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras string lookup preprocessing layer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.python.keras.layers.preprocessing import index_lookup_v1
from tensorflow.python.keras.layers.preprocessing import integer_lookup
class IntegerLookup(integer_lookup.IntegerLookup, index_lookup_v1.IndexLookup):
pass

View File

@ -0,0 +1,106 @@
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras string lookup preprocessing layer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.python.framework import dtypes
from tensorflow.python.keras.layers.preprocessing import index_lookup
from tensorflow.python.keras.layers.preprocessing import table_utils
class StringLookup(index_lookup.IndexLookup):
"""Maps strings from a vocabulary to integer indices.
This layer translates a set of arbitrary strings into an integer output via a
table-based lookup, with optional out-of-vocabulary handling.
If desired, the user can call this layer's `adapt()` method on a data set,
which will analyze the data set, determine the frequency of individual string
values, and create a vocabulary from them. This vocabulary can have
unlimited size or be capped, depending on the configuration options for this
layer; if there are more unique values in the input than the maximum
vocabulary size, the most frequent terms will be used to create the
vocabulary.
Attributes:
max_tokens: The maximum size of the vocabulary for this layer. If None,
there is no cap on the size of the vocabulary. Note that this vocabulary
includes the OOV and mask tokens, so the effective number of tokens is
(max_tokens - num_oov_indices - (1 if mask_token else 0))
num_oov_indices: The number of out-of-vocabulary tokens to use; defaults to
1. If this value is more than 1, OOV inputs are hashed to determine their
OOV value; if this value is 0, passing an OOV input will result in a '-1'
being returned for that value in the output tensor. (Note that, because
the value is -1 and not 0, this will allow you to effectively drop OOV
values from categorical encodings.)
mask_token: A token that represents masked values, and which is mapped to
index 0. Defaults to the empty string "". If set to None, no mask term
will be added and the OOV tokens, if any, will be indexed from
(0...num_oov_indices) instead of (1...num_oov_indices+1).
oov_token: The token representing an out-of-vocabulary value. Defaults to
"[OOV]".
vocabulary: An optional list of vocabulary terms, or a path to a text file
containing a vocabulary to load into this layer. The file should contain
one token per line. If the list or file contains the same token multiple
times, an error will be thrown.
encoding: The Python string encoding to use. Defaults to `'utf-8'`.
"""
def __init__(self,
max_tokens=None,
num_oov_indices=1,
mask_token="",
oov_token="[OOV]",
vocabulary=None,
encoding="utf-8",
**kwargs):
allowed_dtypes = [dtypes.string]
if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes:
raise ValueError("StringLookup may only have a dtype in %s." %
allowed_dtypes)
if "dtype" not in kwargs:
kwargs["dtype"] = dtypes.string
if vocabulary is not None:
if isinstance(vocabulary, str):
vocabulary = table_utils.get_vocabulary_from_file(vocabulary, encoding)
self.encoding = encoding
super(StringLookup, self).__init__(
max_tokens=max_tokens,
num_oov_indices=num_oov_indices,
mask_token=mask_token,
oov_token=oov_token,
vocabulary=vocabulary,
**kwargs)
def get_config(self):
config = {"encoding": self.encoding}
base_config = super(StringLookup, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def get_vocabulary(self):
if self._table_handler.vocab_size() == 0:
return []
keys, values = self._table_handler.data()
# This is required because the MutableHashTable doesn't preserve insertion
# order, but we rely on the order of the array to assign indices.
return [x.decode(self.encoding) for _, x in sorted(zip(values, keys))]
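To make the attribute defaults above concrete, here is a hedged usage sketch (illustrative only, not part of this commit, assuming eager execution). With the default mask_token "" and oov_token "[OOV]", vocabulary terms are indexed from 2 and unknown strings map to index 1:

import numpy as np
from tensorflow.python.keras.layers.preprocessing import string_lookup

layer = string_lookup.StringLookup(vocabulary=["earth", "wind", "and", "fire"])
print(layer.get_vocabulary())  # ['', '[OOV]', 'earth', 'wind', 'and', 'fire']
print(layer(np.array([["fire", "michigan"]])))  # [[5, 1]]: 'michigan' is OOV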

View File

@ -0,0 +1,224 @@
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for Keras text vectorization preprocessing layer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl.testing import parameterized
import numpy as np
import six
from tensorflow.python import keras
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.eager import context
from tensorflow.python.framework import dtypes
from tensorflow.python.keras import keras_parameterized
from tensorflow.python.keras import testing_utils
from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils
from tensorflow.python.keras.layers.preprocessing import string_lookup
from tensorflow.python.keras.layers.preprocessing import string_lookup_v1
from tensorflow.python.keras.saving import save
from tensorflow.python.keras.utils.generic_utils import CustomObjectScope
from tensorflow.python.platform import gfile
from tensorflow.python.platform import test
def get_layer_class():
if context.executing_eagerly():
return string_lookup.StringLookup
else:
return string_lookup_v1.StringLookup
def _get_end_to_end_test_cases():
test_cases = (
{
"testcase_name":
"test_strings_soft_vocab_cap",
# Create an array where 'earth' is the most frequent term, followed by
# 'wind', then 'and', then 'fire'. This ensures that the vocab
# accumulator is sorting by frequency.
"vocab_data":
np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"],
["wind"], ["wind"], ["wind"], ["and"], ["and"]]),
"input_data":
np.array([["earth"], ["wind"], ["and"], ["fire"], ["fire"],
["and"], ["earth"], ["michigan"]]),
"kwargs": {
"max_tokens": None,
},
"expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]],
"input_dtype":
dtypes.string
},
)
crossed_test_cases = []
# Cross above test cases with use_dataset in (True, False)
for use_dataset in (True, False):
for case in test_cases:
case = case.copy()
if use_dataset:
case["testcase_name"] = case["testcase_name"] + "_with_dataset"
case["use_dataset"] = use_dataset
crossed_test_cases.append(case)
return crossed_test_cases
@keras_parameterized.run_all_keras_modes
class StringLookupLayerTest(keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest):
@parameterized.named_parameters(*_get_end_to_end_test_cases())
def test_layer_end_to_end_with_adapt(self, vocab_data, input_data, kwargs,
use_dataset, expected_output,
input_dtype):
cls = get_layer_class()
expected_output_dtype = dtypes.int64
input_shape = input_data.shape
if use_dataset:
# Keras APIs expect batched datasets.
# TODO(rachelim): `model.predict` predicts the result on each
# dataset batch separately, then tries to concatenate the results
# together. When the results have different shapes on the non-concat
# axis (which can happen in the output_mode = INT case for
# StringLookup), the concatenation fails. In real use cases, this may
# not be an issue because users are likely to pipe the preprocessing layer
# into other keras layers instead of predicting it directly. A workaround
# for these unit tests is to have the dataset only contain one batch, so
# no concatenation needs to happen with the result. For consistency with
# numpy input, we should make `predict` join differently shaped results
# together sensibly, with 0 padding.
input_data = dataset_ops.Dataset.from_tensor_slices(input_data).batch(
input_shape[0])
vocab_data = dataset_ops.Dataset.from_tensor_slices(vocab_data).batch(
input_shape[0])
with CustomObjectScope({"StringLookup": cls}):
output_data = testing_utils.layer_test(
cls,
kwargs=kwargs,
input_shape=input_shape,
input_data=input_data,
input_dtype=input_dtype,
expected_output_dtype=expected_output_dtype,
validate_training=False,
adapt_data=vocab_data)
self.assertAllClose(expected_output, output_data)
@keras_parameterized.run_all_keras_modes
class StringLookupVocabularyTest(keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest
):
def _write_to_temp_file(self, file_name, vocab_list):
vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt")
with gfile.GFile(vocab_path, "w") as writer:
for vocab in vocab_list:
writer.write(vocab + "\n")
writer.flush()
writer.close()
return vocab_path
def test_int_output_explicit_vocab(self):
vocab_data = ["earth", "wind", "and", "fire"]
input_array = np.array([["earth", "wind", "and", "fire"],
["fire", "and", "earth", "michigan"]])
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(vocabulary=vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
def test_get_vocab_returns_str(self):
vocab_data = ["earth", "wind", "and", "fire"]
expected_vocab = ["", "[OOV]", "earth", "wind", "and", "fire"]
layer = get_layer_class()(vocabulary=vocab_data)
layer_vocab = layer.get_vocabulary()
self.assertAllEqual(expected_vocab, layer_vocab)
self.assertIsInstance(layer_vocab[0], six.text_type)
def test_int_output_explicit_vocab_from_file(self):
vocab_list = ["earth", "wind", "and", "fire"]
vocab_path = self._write_to_temp_file("vocab_file", vocab_list)
input_array = np.array([["earth", "wind", "and", "fire"],
["fire", "and", "earth", "michigan"]])
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(vocabulary=vocab_path)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
def test_non_unique_vocab_fails(self):
vocab_data = ["earth", "wind", "and", "fire", "fire"]
with self.assertRaisesRegex(ValueError, ".*repeated term.*fire.*"):
_ = get_layer_class()(vocabulary=vocab_data)
def test_non_unique_vocab_from_file_fails(self):
vocab_list = ["earth", "wind", "and", "fire", "earth"]
vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list)
with self.assertRaisesRegex(ValueError, ".*repeated term.*earth.*"):
_ = get_layer_class()(vocabulary=vocab_path)
@keras_parameterized.run_all_keras_modes(always_skip_eager=True)
class StringLookupSaveableTest(keras_parameterized.TestCase,
preprocessing_test_utils.PreprocessingLayerTest):
def test_ops_are_not_added_with_multiple_get_set_weights(self):
vocab_data = ["earth", "wind", "and", "fire"]
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(max_tokens=10)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
weights = model.get_weights()
model.set_weights(weights)
keras.backend.get_session().graph.finalize()
weights = model.get_weights()
model.set_weights(weights)
def test_layer_saving_with_h5(self):
vocab_data = ["earth", "wind", "and", "fire"]
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(max_tokens=10)
layer.set_vocabulary(vocab_data)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
path = os.path.join(self.get_temp_dir(), "model")
with self.assertRaisesRegex(NotImplementedError,
"Save or restore weights that is not.*"):
save.save_model(model, path, save_format="h5")
if __name__ == "__main__":
test.main()

View File

@ -0,0 +1,25 @@
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras string lookup preprocessing layer."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.python.keras.layers.preprocessing import index_lookup_v1
from tensorflow.python.keras.layers.preprocessing import string_lookup
class StringLookup(string_lookup.StringLookup, index_lookup_v1.IndexLookup):
pass

View File

@ -189,4 +189,3 @@ def convert_to_ndarray(x, dtype=None):
if np.can_cast(array.dtype, np_dtype):
array = array.astype(np_dtype, casting="safe")
return array

View File

@ -32,7 +32,7 @@ from tensorflow.python.keras import backend as K
from tensorflow.python.keras.engine.base_preprocessing_layer import Combiner
from tensorflow.python.keras.engine.base_preprocessing_layer import CombinerPreprocessingLayer
from tensorflow.python.keras.layers.preprocessing import categorical_encoding
from tensorflow.python.keras.layers.preprocessing import index_lookup
from tensorflow.python.keras.layers.preprocessing import string_lookup
from tensorflow.python.keras.utils import layer_utils
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
@ -269,10 +269,6 @@ class TextVectorization(CombinerPreprocessingLayer):
self._max_tokens = max_tokens
# In INT mode, we have two reserved values (PAD and OOV). However, non-INT
# modes don't have a PAD value, so we only need to reserve one value.
self._reserved_values = 2 if output_mode == INT else 1
# In INT mode, the zero value is reserved for padding (per Keras standard
# padding approaches). In non-INT modes, there is no padding so we can set
# the OOV value to zero instead of one.
@ -303,9 +299,9 @@ class TextVectorization(CombinerPreprocessingLayer):
self._max_vocab_size, compute_idf=output_mode == TFIDF),
**kwargs)
reserve_zero = output_mode in [None, INT]
mask_token = "" if output_mode in [None, INT] else None
self._index_lookup_layer = self._get_index_lookup_class()(
max_tokens=max_tokens, reserve_zero=reserve_zero, dtype=dtypes.string)
max_tokens=max_tokens, mask_token=mask_token)
# If this layer is configured for string or integer output, we do not
# create a vectorization layer (as the output is not vectorized).
@ -328,7 +324,7 @@ class TextVectorization(CombinerPreprocessingLayer):
return (keys.numpy(), values.numpy())
def _get_index_lookup_class(self):
return index_lookup.IndexLookup
return string_lookup.StringLookup
def _to_numpy(self, preprocessed_data):
"""Converts preprocessed inputs into numpy arrays."""
@ -428,26 +424,21 @@ class TextVectorization(CombinerPreprocessingLayer):
def set_vocabulary(self,
vocab,
df_data=None,
oov_df_value=None,
append=False):
oov_df_value=None):
"""Sets vocabulary (and optionally document frequency) data for this layer.
This method sets the vocabulary and DF data for this layer directly, instead
of analyzing a dataset through 'adapt'. It should be used whenever the vocab
(and optionally document frequency) information is already known. If
vocabulary data is already present in the layer, this method will either
replace it, if 'append' is set to False, or append to it (if 'append' is set
to True).
vocabulary data is already present in the layer, this method will replace
it.
Arguments:
vocab: An array of string tokens.
df_data: An array of document frequency data. Only necessary if the layer
output_mode is TFIDF.
oov_df_value: The document frequency of the OOV token. Only necessary if
output_mode is TFIDF. OOV data is optional when appending additional
data in TFIDF mode; if an OOV value is supplied it will overwrite the
existing OOV value.
append: Whether to overwrite or append any existing vocabulary data.
output_mode is TFIDF.
Raises:
ValueError: If there are too many inputs, the inputs do not match, or
@ -468,8 +459,7 @@ class TextVectorization(CombinerPreprocessingLayer):
"be changed after the layer is "
"called.").format(mode=self._output_mode))
current_table_size = self._index_lookup_layer.vocab_size()
self._index_lookup_layer.set_vocabulary(vocab, append)
self._index_lookup_layer.set_vocabulary(vocab)
# When doing raw or integer output, we don't have a Vectorize layer to
# manage. In this case, we can return directly.
@ -477,14 +467,9 @@ class TextVectorization(CombinerPreprocessingLayer):
return
if not self._pad_to_max or self._max_tokens is None:
num_tokens = self._index_lookup_layer.vocab_size() + self._reserved_values
num_tokens = self._index_lookup_layer.vocab_size()
self._vectorize_layer.set_num_elements(num_tokens)
# We're only _really_ appending if the table_size is nonzero. This is
# important for some sanity checks in tfidf mode (specifically, checking if
# oov_df_value is set or not) and handling existing tfidf weight data.
append = append if current_table_size > 0 else False
if self._output_mode == TFIDF:
if df_data is None:
raise ValueError("df_data must be set if output_mode is TFIDF")
@ -492,31 +477,14 @@ class TextVectorization(CombinerPreprocessingLayer):
raise ValueError("df_data must be the same length as vocab. "
"len(df_data) is %s, len(vocab) is %s" %
(len(vocab), len(df_data)))
if not append and oov_df_value is None:
raise ValueError("You must pass an oov_df_value the first time "
"'set_vocabulary' is called when output_mode is "
if oov_df_value is None:
raise ValueError("You must pass an oov_df_value when output_mode is "
"TFIDF.")
df_data = self._convert_to_ndarray(df_data)
if append:
# The existing IDF data is stored in a Keras weight, so we can get it
# by calling K.get_value() on the weight object. Take the first
# table_size+1 values in case we're padding the weight with zeros
existing_df_data = K.get_value(
self._vectorize_layer.tf_idf_weights)[:current_table_size + 1]
df_data = np.append(existing_df_data, df_data, axis=0)
# If we are appending and need to replace the OOV DF value, we can
# assign it over the existing OOV DF value at index 0 of the (already-
# concatenated) DF value array.
if oov_df_value is not None:
df_data[0] = oov_df_value
else:
# If we are not appending (that is, we have only new data) we need to
# insert the OOV value to the front of the array. (This is an append to
# the head, not a replacement of the zeroth value.)
if not isinstance(oov_df_value, np.ndarray):
oov_df_value = np.array([oov_df_value])
df_data = np.insert(df_data, 0, oov_df_value)
if not isinstance(oov_df_value, np.ndarray):
oov_df_value = np.array([oov_df_value])
df_data = np.insert(df_data, 0, oov_df_value)
self._vectorize_layer.set_tfidf_data(df_data)
def build(self, input_shape):
@ -536,8 +504,10 @@ class TextVectorization(CombinerPreprocessingLayer):
if not self.built:
raise RuntimeError("_set_state_variables() must be called after build().")
if self._output_mode == TFIDF:
self.set_vocabulary(updates[_VOCAB_NAME], updates[_IDF_NAME],
updates[_OOV_IDF_NAME])
self.set_vocabulary(
updates[_VOCAB_NAME],
updates[_IDF_NAME],
updates[_OOV_IDF_NAME])
else:
self.set_vocabulary(updates[_VOCAB_NAME])
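With the append path removed, TFIDF vocabulary data is now set in a single call, and oov_df_value must always be supplied in TFIDF mode (it is inserted at index 0 of the DF array internally). An illustrative sketch of the updated call, mirroring the arguments used in the tests further down (not part of this commit, assuming eager execution):

from tensorflow.python.keras.layers.preprocessing import text_vectorization

layer = text_vectorization.TextVectorization(
    max_tokens=5, standardize=None, split=None,
    output_mode=text_vectorization.TFIDF)
# One-shot call: vocabulary, per-term document frequencies, and the OOV
# document frequency are all provided together; there is no append argument.
layer.set_vocabulary(
    ["earth", "wind", "and", "fire"],
    df_data=[.5, .25, .2, .125],
    oov_df_value=.05)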

View File

@ -619,25 +619,6 @@ class TextVectorizationOutputTest(
output_dataset = model.predict(input_array)
self.assertAllEqual(expected_output, output_dataset)
def test_vocab_appending(self):
vocab_data = [["earth", "wind"], ["and", "fire"]]
input_array = np.array([["earth", "wind", "and", "fire"],
["fire", "and", "earth", "michigan"]])
expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(
max_tokens=5,
standardize=None,
split=None,
output_mode=text_vectorization.INT)
layer.set_vocabulary(vocab_data[0])
layer.set_vocabulary(vocab_data[1], append=True)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllClose(expected_output, output_dataset)
def test_int_output_densifies_with_zeros(self):
vocab_data = ["earth", "wind", "and", "fire"]
# Create an input array that has 5 elements in the first example and 4 in
@ -1046,7 +1027,10 @@ class TextVectorizationOutputTest(
split=None,
output_mode=text_vectorization.TFIDF,
pad_to_max_tokens=True)
layer.set_vocabulary(vocab_data, df_data=tfidf_data, oov_df_value=.05)
layer.set_vocabulary(
vocab_data,
df_data=tfidf_data,
oov_df_value=.05)
int_data = layer(input_data)
self.assertAllEqual(expected_output_shape, int_data.shape.as_list())
@ -1084,60 +1068,6 @@ class TextVectorizationOutputTest(
output_dataset = model.predict(input_array)
self.assertAllClose(expected_output, output_dataset)
def test_tfidf_appending(self):
vocab_data = [["earth", "wind"], ["and", "fire"]]
tfidf_data = [[.5, .25], [.2, .125]]
input_array = np.array([["earth", "wind", "and", "earth"],
["ohio", "fire", "earth", "michigan"]])
# pyformat: disable
# pylint: disable=bad-whitespace
expected_output = [[ 0, 1, .25, .2, 0],
[.1, .5, 0, 0, .125]]
# pylint: enable=bad-whitespace
# pyformat: enable
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(
max_tokens=5,
standardize=None,
split=None,
output_mode=text_vectorization.TFIDF)
layer.set_vocabulary(vocab_data[0], df_data=tfidf_data[0], oov_df_value=.05)
layer.set_vocabulary(vocab_data[1], df_data=tfidf_data[1], append=True)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllClose(expected_output, output_dataset)
def test_tfidf_appending_with_oov_replacement(self):
vocab_data = [["earth", "wind"], ["and", "fire"]]
tfidf_data = [[.5, .25], [.2, .125]]
input_array = np.array([["earth", "wind", "and", "earth"],
["ohio", "fire", "earth", "michigan"]])
# pyformat: disable
# pylint: disable=bad-whitespace
expected_output = [[ 0, 1, .25, .2, 0],
[1.5, .5, 0, 0, .125]]
# pylint: enable=bad-whitespace
# pyformat: enable
input_data = keras.Input(shape=(None,), dtype=dtypes.string)
layer = get_layer_class()(
max_tokens=5,
standardize=None,
split=None,
output_mode=text_vectorization.TFIDF)
layer.set_vocabulary(vocab_data[0], df_data=tfidf_data[0], oov_df_value=.05)
# Note that here we've replaced the OOV value.
layer.set_vocabulary(
vocab_data[1], df_data=tfidf_data[1], oov_df_value=.75, append=True)
int_data = layer(input_data)
model = keras.Model(inputs=input_data, outputs=int_data)
output_dataset = model.predict(input_array)
self.assertAllClose(expected_output, output_dataset)
def test_accept_1D_input(self):
input_array = np.array(["earth wind and fire",
"fire and earth michigan"])
@ -1274,22 +1204,6 @@ class TextVectorizationErrorTest(keras_parameterized.TestCase,
"vocabulary larger than the maximum vocab.*"):
layer.set_vocabulary(vocab_data)
def test_too_long_vocab_fails_in_multiple_settings(self):
vocab_data = [["earth", "wind"], ["and", "fire"]]
layer = get_layer_class()(
max_tokens=4,
standardize=None,
split=None,
output_mode=text_vectorization.INT)
# The first time we call set_vocabulary, we're under the max_tokens limit
# so it should be fine.
layer.set_vocabulary(vocab_data[0])
with self.assertRaisesRegex(ValueError,
"vocabulary larger than the maximum vocab.*"):
layer.set_vocabulary(vocab_data[1], append=True)
def test_setting_vocab_without_tfidf_data_fails_in_tfidf_mode(self):
vocab_data = ["earth", "wind", "and", "fire"]
@ -1326,18 +1240,6 @@ class TextVectorizationErrorTest(keras_parameterized.TestCase,
"You must pass an oov_df_value.*"):
layer.set_vocabulary(vocab_data, df_data)
def test_tfidf_set_vocab_with_no_oov_fails_with_append_set(self):
vocab_data = ["earth", "wind", "and", "fire"]
df_data = [1, 2, 3, 4]
layer = get_layer_class()(
max_tokens=5,
standardize=None,
split=None,
output_mode=text_vectorization.TFIDF)
with self.assertRaisesRegex(ValueError,
"You must pass an oov_df_value.*"):
layer.set_vocabulary(vocab_data, df_data, append=True)
def test_set_tfidf_in_non_tfidf_fails(self):
vocab_data = ["earth", "wind", "and", "fire"]
df_data = [1, 2, 3, 4]

View File

@ -23,7 +23,7 @@ import numpy as np
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.engine import base_preprocessing_layer_v1
from tensorflow.python.keras.layers.preprocessing import categorical_encoding_v1
from tensorflow.python.keras.layers.preprocessing import index_lookup_v1
from tensorflow.python.keras.layers.preprocessing import string_lookup_v1
from tensorflow.python.keras.layers.preprocessing import text_vectorization
from tensorflow.python.ops.ragged import ragged_tensor_value
from tensorflow.python.util.tf_export import keras_export
@ -84,7 +84,7 @@ class TextVectorization(text_vectorization.TextVectorization,
return categorical_encoding_v1.CategoricalEncoding
def _get_index_lookup_class(self):
return index_lookup_v1.IndexLookup
return string_lookup_v1.StringLookup
def _to_numpy(self, data):
"""Converts preprocessed inputs into numpy arrays."""

View File

@ -221,7 +221,7 @@ tf_class {
}
member_method {
name: "set_vocabulary"
argspec: "args=[\'self\', \'vocab\', \'df_data\', \'oov_df_value\', \'append\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], "
argspec: "args=[\'self\', \'vocab\', \'df_data\', \'oov_df_value\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
}
member_method {
name: "set_weights"

View File

@ -219,7 +219,7 @@ tf_class {
}
member_method {
name: "set_vocabulary"
argspec: "args=[\'self\', \'vocab\', \'df_data\', \'oov_df_value\', \'append\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], "
argspec: "args=[\'self\', \'vocab\', \'df_data\', \'oov_df_value\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
}
member_method {
name: "set_weights"