Make CategoryEncoding work with 1D inputs and list inputs.

Also add error message when passing max_tokens=None and not calling adapt. Note that the 1D fix won't apply to SparseTensor and RaggedTensor at this time. PiperOrigin-RevId: 315803768 Change-Id: I7d302a2a9009ad63db3c5fb6a4209f63da8f2635
2020-06-10 17:43:48 -07:00 · 2020-06-10 17:43:48 -07:00 · a745f0a953
commit a745f0a953
parent 5d1368571f
3 changed files with 33 additions and 0 deletions
--- a/tensorflow/python/keras/layers/preprocessing/category_encoding.py
+++ b/tensorflow/python/keras/layers/preprocessing/category_encoding.py
@ -267,12 +267,22 @@ class CategoryEncoding(base_preprocessing_layer.CombinerPreprocessingLayer):
    K.set_value(self.tf_idf_weights, tfidf_data)
  def call(self, inputs, count_weights=None):
    if isinstance(inputs, (list, np.ndarray)):
      inputs = ops.convert_to_tensor_v2(inputs)
    if inputs.shape.rank == 1:
      inputs = array_ops.expand_dims(inputs, 1)
    if count_weights is not None and self._output_mode != COUNT:
      raise ValueError("count_weights is not used in `output_mode='tf-idf'`, "
                       "or `output_mode='binary'`. Please pass a single input.")
    self._called = True
    if self._max_tokens is None:
      out_depth = K.get_value(self.num_elements)
      if out_depth == 0:
        raise RuntimeError(
            "If you construct a `CategoryEncoding` layer with "
            "`max_tokens=None`, you need to call `adapt()` "
            "on it before using it")
    else:
      out_depth = self._max_tokens
@ -352,6 +362,8 @@ class _CategoryEncodingCombiner(base_preprocessing_layer.Combiner):
    # TODO(momernick): Benchmark improvements to this algorithm.
    for element in values:
      if not isinstance(element, list):
        element = [element]
      current_doc_id = accumulator.data[self.DOC_ID_IDX]
      for value in element:
        current_max_value = accumulator.data[self.MAX_VALUE_IDX]
--- a/tensorflow/python/keras/layers/preprocessing/category_encoding_test.py
+++ b/tensorflow/python/keras/layers/preprocessing/category_encoding_test.py
@ -405,6 +405,7 @@ class CategoryEncodingAdaptTest(keras_parameterized.TestCase,
    input_data = keras.Input(shape=(None,), dtype=dtypes.int32)
    layer = get_layer_class()(
        max_tokens=None, output_mode=category_encoding.BINARY)
    layer.adapt([1, 2])
    _ = layer(input_data)
    with self.assertRaisesRegex(RuntimeError, "num_elements cannot be changed"):
      layer.set_num_elements(5)
@ -415,6 +416,7 @@ class CategoryEncodingAdaptTest(keras_parameterized.TestCase,
    input_data = keras.Input(shape=(None,), dtype=dtypes.int32)
    layer = get_layer_class()(
        max_tokens=None, output_mode=category_encoding.BINARY)
    layer.adapt(vocab_data)
    _ = layer(input_data)
    with self.assertRaisesRegex(RuntimeError, "can't be adapted"):
      layer.adapt(vocab_data)
@ -425,6 +427,7 @@ class CategoryEncodingAdaptTest(keras_parameterized.TestCase,
    input_data = keras.Input(shape=(None,), dtype=dtypes.int32)
    layer = get_layer_class()(
        max_tokens=None, output_mode=category_encoding.BINARY)
    layer.adapt([1, 2])
    _ = layer(input_data)
    with self.assertRaisesRegex(RuntimeError, "num_elements cannot be changed"):
      layer._set_state_variables(state_variables)
@ -741,6 +744,21 @@ class CategoryEncodingCombinerTest(
    self.validate_accumulator_computation(combiner, data, expected_accumulator)
    self.validate_accumulator_extract(combiner, data, expected_extract_output)
  def test_1d_data(self):
    data = [1, 2, 3]
    cls = get_layer_class()
    layer = cls()
    layer.adapt(data)
    output = layer(data)
    self.assertListEqual(output.shape.as_list(), [3, 4])
  def test_no_adapt_exception(self):
    cls = get_layer_class()
    layer = cls()
    with self.assertRaisesRegex(
        RuntimeError, r".*you need to call.*"):
      _ = layer([1, 2, 3])
 if __name__ == "__main__":
  test.main()
--- a/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py
+++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py
@ -1037,6 +1037,7 @@ class TextVectorizationOutputTest(
        split=None,
        output_mode=text_vectorization.BINARY,
        pad_to_max_tokens=False)
    layer.adapt(vocab_data)
    _ = layer(input_data)
    with self.assertRaisesRegex(RuntimeError, "vocabulary cannot be changed"):
      layer.set_vocabulary(vocab_data)
@ -1054,6 +1055,7 @@ class TextVectorizationOutputTest(
        split=None,
        output_mode=text_vectorization.BINARY,
        pad_to_max_tokens=False)
    layer.adapt(vocab_data)
    _ = layer(input_data)
    with self.assertRaisesRegex(RuntimeError, "can't be adapted after being"):
      layer.adapt(vocab_data)
@ -1070,6 +1072,7 @@ class TextVectorizationOutputTest(
        split=None,
        output_mode=text_vectorization.BINARY,
        pad_to_max_tokens=False)
    layer.adapt(["earth", "wind"])
    _ = layer(input_data)
    with self.assertRaisesRegex(RuntimeError, "vocabulary cannot be changed"):
      layer._set_state_variables(state_variables)