Adding hashing trick for Hashing layer.

PiperOrigin-RevId: 294821319 Change-Id: Id127f5df76311bc3904dd7f68c628b310cbc9e85
2020-02-12 20:10:26 -08:00 · 2020-02-12 20:10:26 -08:00 · 0d23b37aaf
commit 0d23b37aaf
parent f6a8bcd0f2
2 changed files with 136 additions and 36 deletions
--- a/tensorflow/python/keras/layers/preprocessing/hashing.py
+++ b/tensorflow/python/keras/layers/preprocessing/hashing.py
@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 import functools
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_spec
@ -35,16 +37,39 @@ class Hashing(Layer):
  tensorflow::ops::Fingerprint to produce universal output that is consistent
  across platforms.
-  Usage:
+  This layer uses [FarmHash64](https://github.com/google/farmhash) by default,
  which provides a consistent hashed output across different platforms and is
  stable across invocations, regardless of device and context, by mixing the
  input bits thoroughly.
  If you want to obfuscate the hashed output, you can also pass a random `salt`
  argument in the constructor. In that case, the layer will use the
  [SipHash64](https://github.com/google/highwayhash) hash function, with
  the `salt` value serving as additional input to the hash function.
  Example (FarmHash64):
  ```python
    layer = Hashing(num_bins=3)
-    inp = np.asarray([['A', 'B'], ['C', 'A']])
+    inp = np.asarray([['A'], ['B'], ['C'], ['D'], ['E']])
    layer(inputs)
-    [[0, 0], [1, 0]]
+    [[1], [0], [1], [1], [2]]
  ```
  Example (SipHash64):
  ```python
    layer = Hashing(num_bins=3, salt=[133, 137])
    inp = np.asarray([['A'], ['B'], ['C'], ['D'], ['E']])
    layer(inputs)
    [[1], [2], [1], [0], [2]]
  ```
  Arguments:
    num_bins: Number of hash bins.
    salt: A tuple/list of 2 unsigned integer numbers. If passed, the hash
      function used will be SipHash64, with these values used as an additional
      input (known as a "salt" in cryptography).
      These should be non-zero. Defaults to `None` (in that
      case, the FarmHash64 hash function is used).
    name: Name to give to the layer.
    **kwargs: Keyword arguments to construct a layer.
@ -53,38 +78,46 @@ class Hashing(Layer):
  Output shape: An int64 tensor of shape `[batch_size, d1, ..., dm]`
  Example:
    If the input is a 5 by 1 string tensor '[['A'], ['B'], ['C'], ['D'], ['E']]'
    with `num_bins=2`, then output is 5 by 1 integer tensor
    [[hash('A')], [hash('B')], [hash('C')], [hash('D')], [hash('E')]].
  """
-  def __init__(self, num_bins, name=None, **kwargs):
+  def __init__(self, num_bins, salt=None, name=None, **kwargs):
-    # TODO(tanzheny): consider adding strong hash variant.
+    if num_bins is None or num_bins <= 0:
      raise ValueError('`num_bins` cannot be `None` or non-positive values.')
    if salt is not None:
      if not isinstance(salt, (tuple, list)) or len(salt) != 2:
        raise ValueError('`salt` must be a tuple or list of 2 unsigned '
                         'integer numbers, got {}'.format(salt))
    super(Hashing, self).__init__(name=name, **kwargs)
-    self._num_bins = num_bins
+    self.num_bins = num_bins
    self.salt = salt
    self._supports_ragged_inputs = True
  def call(self, inputs):
    # TODO(tanzheny): Add int support.
-    # string_to_hash_bucket_fast uses FarmHash as hash function.
+    str_to_hash_bucket = self._get_string_to_hash_bucket_fn()
    if ragged_tensor.is_ragged(inputs):
      return ragged_functional_ops.map_flat_values(
-          string_ops.string_to_hash_bucket_fast,
+          str_to_hash_bucket, inputs, num_buckets=self.num_bins, name='hash')
          inputs,
          num_buckets=self._num_bins,
          name='hash')
    elif isinstance(inputs, sparse_tensor.SparseTensor):
      sparse_values = inputs.values
-      sparse_hashed_values = string_ops.string_to_hash_bucket_fast(
+      sparse_hashed_values = str_to_hash_bucket(
-          sparse_values, self._num_bins, name='hash')
+          sparse_values, self.num_bins, name='hash')
      return sparse_tensor.SparseTensor(
          indices=inputs.indices,
          values=sparse_hashed_values,
          dense_shape=inputs.dense_shape)
    else:
-      return string_ops.string_to_hash_bucket_fast(
+      return str_to_hash_bucket(inputs, self.num_bins, name='hash')
-          inputs, self._num_bins, name='hash')
+
  def _get_string_to_hash_bucket_fn(self):
    """Returns the string_to_hash_bucket op to use based on `hasher_key`."""
    # string_to_hash_bucket_fast uses FarmHash64 as hash function.
    if self.salt is None:
      return string_ops.string_to_hash_bucket_fast
    # string_to_hash_bucket_strong uses SipHash64 as hash function.
    else:
      return functools.partial(
          string_ops.string_to_hash_bucket_strong, key=self.salt)
  def compute_output_shape(self, input_shape):
    return input_shape
@ -99,6 +132,6 @@ class Hashing(Layer):
      return tensor_spec.TensorSpec(shape=output_shape, dtype=output_dtype)
  def get_config(self):
-    config = {'num_bins': self._num_bins}
+    config = {'num_bins': self.num_bins, 'salt': self.salt}
    base_config = super(Hashing, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
--- a/tensorflow/python/keras/layers/preprocessing/hashing_test.py
+++ b/tensorflow/python/keras/layers/preprocessing/hashing_test.py
@ -20,6 +20,7 @@ from __future__ import print_function
 import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
@ -40,43 +41,109 @@ class HashingTest(keras_parameterized.TestCase):
    layer = hashing.Hashing(num_bins=1)
    inp = np.asarray([['A'], ['B'], ['C'], ['D'], ['E']])
    output = layer(inp)
-    self.assertAllClose(np.asarray([[0], [0], [0], [0], [0]]), output)
+    self.assertAllClose([[0], [0], [0], [0], [0]], output)
-  def test_hash_two_bins(self):
+  def test_hash_dense_input_farmhash(self):
    layer = hashing.Hashing(num_bins=2)
-    inp = np.asarray([['A'], ['B'], ['C'], ['D'], ['E']])
+    inp = np.asarray([['omar'], ['stringer'], ['marlo'], ['wire'],
                      ['skywalker']])
    output = layer(inp)
-    self.assertEqual(output.numpy().max(), 1)
+    # Assert equal for hashed output that should be true on all platforms.
-    self.assertEqual(output.numpy().min(), 0)
+    self.assertAllClose([[0], [0], [1], [0], [0]], output)
-  def test_hash_sparse_input(self):
+  def test_hash_dense_input_siphash(self):
    layer = hashing.Hashing(num_bins=2, salt=[133, 137])
    inp = np.asarray([['omar'], ['stringer'], ['marlo'], ['wire'],
                      ['skywalker']])
    output = layer(inp)
    # Assert equal for hashed output that should be true on all platforms.
    # Note the result is different from FarmHash.
    self.assertAllClose([[0], [1], [0], [1], [0]], output)
    layer_2 = hashing.Hashing(num_bins=2, salt=[211, 137])
    output_2 = layer_2(inp)
    # Note the result is different from (133, 137).
    self.assertAllClose([[1], [0], [1], [0], [1]], output_2)
  def test_hash_sparse_input_farmhash(self):
    layer = hashing.Hashing(num_bins=2)
    indices = [[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]]
    inp = sparse_tensor.SparseTensor(
-        indices=[[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]],
+        indices=indices,
        values=['omar', 'stringer', 'marlo', 'wire', 'skywalker'],
        dense_shape=[3, 2])
    output = layer(inp)
-    self.assertEqual(output.values.numpy().max(), 1)
+    self.assertAllClose(indices, output.indices)
-    self.assertEqual(output.values.numpy().min(), 0)
+    self.assertAllClose([0, 0, 1, 0, 0], output.values)
-  def test_hash_ragged_string_input(self):
+  def test_hash_sparse_input_siphash(self):
    layer = hashing.Hashing(num_bins=2, salt=[133, 137])
    indices = [[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]]
    inp = sparse_tensor.SparseTensor(
        indices=indices,
        values=['omar', 'stringer', 'marlo', 'wire', 'skywalker'],
        dense_shape=[3, 2])
    output = layer(inp)
    self.assertAllClose(output.indices, indices)
    # The result should be same with test_hash_dense_input_siphash.
    self.assertAllClose([0, 1, 0, 1, 0], output.values)
    layer_2 = hashing.Hashing(num_bins=2, salt=[211, 137])
    output = layer_2(inp)
    # The result should be same with test_hash_dense_input_siphash.
    self.assertAllClose([1, 0, 1, 0, 1], output.values)
  def test_hash_ragged_string_input_farmhash(self):
    layer = hashing.Hashing(num_bins=2)
    inp_data = ragged_factory_ops.constant(
        [['omar', 'stringer', 'marlo', 'wire'], ['marlo', 'skywalker', 'wire']],
        dtype=dtypes.string)
    out_data = layer(inp_data)
-    self.assertEqual(out_data.values.numpy().max(), 1)
+    # Same hashed output as test_hash_sparse_input_farmhash
-    self.assertEqual(out_data.values.numpy().min(), 0)
+    expected_output = [[0, 0, 1, 0], [1, 0, 0]]
-    # hash of 'marlo' should be same.
+    self.assertAllEqual(expected_output, out_data)
    self.assertAllClose(out_data[0][2], out_data[1][0])
    # hash of 'wire' should be same.
    self.assertAllClose(out_data[0][3], out_data[1][2])
    inp_t = input_layer.Input(shape=(None,), ragged=True, dtype=dtypes.string)
    out_t = layer(inp_t)
    model = training.Model(inputs=inp_t, outputs=out_t)
    self.assertAllClose(out_data, model.predict(inp_data))
  def test_hash_ragged_string_input_siphash(self):
    layer = hashing.Hashing(num_bins=2, salt=[133, 137])
    inp_data = ragged_factory_ops.constant(
        [['omar', 'stringer', 'marlo', 'wire'], ['marlo', 'skywalker', 'wire']],
        dtype=dtypes.string)
    out_data = layer(inp_data)
    # Same hashed output as test_hash_dense_input_siphash
    expected_output = [[0, 1, 0, 1], [0, 0, 1]]
    self.assertAllEqual(expected_output, out_data)
    inp_t = input_layer.Input(shape=(None,), ragged=True, dtype=dtypes.string)
    out_t = layer(inp_t)
    model = training.Model(inputs=inp_t, outputs=out_t)
    self.assertAllClose(out_data, model.predict(inp_data))
    layer_2 = hashing.Hashing(num_bins=2, salt=[211, 137])
    out_data = layer_2(inp_data)
    expected_output = [[1, 0, 1, 0], [1, 1, 0]]
    self.assertAllEqual(expected_output, out_data)
    out_t = layer_2(inp_t)
    model = training.Model(inputs=inp_t, outputs=out_t)
    self.assertAllClose(out_data, model.predict(inp_data))
  def test_invalid_inputs(self):
    with self.assertRaisesRegexp(ValueError, 'cannot be `None`'):
      _ = hashing.Hashing(num_bins=None)
    with self.assertRaisesRegexp(ValueError, 'cannot be `None`'):
      _ = hashing.Hashing(num_bins=-1)
    with self.assertRaisesRegexp(ValueError, 'must be a tuple'):
      _ = hashing.Hashing(num_bins=2, salt='string')
    with self.assertRaisesRegexp(ValueError, 'must be a tuple'):
      _ = hashing.Hashing(num_bins=2, salt=[1])
    with self.assertRaisesRegexp(ValueError, 'must be a tuple'):
      _ = hashing.Hashing(num_bins=1, salt=constant_op.constant([133, 137]))
  def test_hash_compute_output_signature(self):
    input_shape = tensor_shape.TensorShape([2, 3])
    input_spec = tensor_spec.TensorSpec(input_shape, dtypes.string)