Embedding feature column performance optimization.

PiperOrigin-RevId: 292193767
Change-Id: I92006247b40fa0025bab6f35ac74e44ef43c2397
This commit is contained in:
A. Unique TensorFlower 2020-01-29 12:19:21 -08:00 committed by TensorFlower Gardener
parent 7db3d7abe3
commit f1e95d1ba1
9 changed files with 488 additions and 121 deletions

View File

@ -821,7 +821,8 @@ def _embedding_column(categorical_column,
ckpt_to_load_from=None,
tensor_name_in_ckpt=None,
max_norm=None,
trainable=True):
trainable=True,
use_safe_embedding_lookup=True):
"""`_DenseColumn` that converts from sparse, categorical input.
Use this when your inputs are sparse, but you want to convert them to a dense
@ -882,6 +883,13 @@ def _embedding_column(categorical_column,
not `None`.
max_norm: If not `None`, embedding values are l2-normalized to this value.
trainable: Whether or not the embedding is trainable. Default is True.
use_safe_embedding_lookup: If true, uses safe_embedding_lookup_sparse
instead of embedding_lookup_sparse. safe_embedding_lookup_sparse ensures
there are no empty rows and all weights and ids are positive at the
expense of extra compute cost. This only applies to rank 2 (NxM) shaped
input tensors. Defaults to true, consider turning off if the above checks
are not needed. Note that having empty rows will not trigger any error
though the output result might be 0 or omitted.
Returns:
`_DenseColumn` that converts from sparse input.
@ -926,7 +934,8 @@ def _embedding_column(categorical_column,
ckpt_to_load_from=ckpt_to_load_from,
tensor_name_in_ckpt=tensor_name_in_ckpt,
max_norm=max_norm,
trainable=trainable)
trainable=trainable,
use_safe_embedding_lookup=use_safe_embedding_lookup)
def _numeric_column(key,
@ -2444,9 +2453,32 @@ class _EmbeddingColumn(
collections.namedtuple(
'_EmbeddingColumn',
('categorical_column', 'dimension', 'combiner', 'layer_creator',
'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable'))):
'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable',
'use_safe_embedding_lookup'))):
"""See `embedding_column`."""
def __new__(cls,
categorical_column,
dimension,
combiner,
layer_creator,
ckpt_to_load_from,
tensor_name_in_ckpt,
max_norm,
trainable,
use_safe_embedding_lookup=True):
return super(_EmbeddingColumn, cls).__new__(
cls,
categorical_column=categorical_column,
dimension=dimension,
combiner=combiner,
layer_creator=layer_creator,
ckpt_to_load_from=ckpt_to_load_from,
tensor_name_in_ckpt=tensor_name_in_ckpt,
max_norm=max_norm,
trainable=trainable,
use_safe_embedding_lookup=use_safe_embedding_lookup)
@property
def name(self):
if not hasattr(self, '_name'):
@ -2489,11 +2521,17 @@ class _EmbeddingColumn(
self.tensor_name_in_ckpt: to_restore
})
sparse_id_rank = tensor_shape.dimension_value(
sparse_ids.dense_shape.get_shape()[0])
embedding_lookup_sparse = embedding_ops.safe_embedding_lookup_sparse
if (not self.use_safe_embedding_lookup and sparse_id_rank is not None and
sparse_id_rank <= 2):
embedding_lookup_sparse = embedding_ops.embedding_lookup_sparse
# Return embedding lookup result.
return embedding_ops.safe_embedding_lookup_sparse(
embedding_weights=embedding_weights,
sparse_ids=sparse_ids,
sparse_weights=sparse_weights,
return embedding_lookup_sparse(
embedding_weights,
sparse_ids,
sparse_weights,
combiner=self.combiner,
name='%s_weights' % self.name,
max_norm=self.max_norm)
@ -2551,7 +2589,8 @@ class _SharedEmbeddingColumn(
'_SharedEmbeddingColumn',
('categorical_column', 'dimension', 'combiner', 'initializer',
'shared_embedding_collection_name', 'ckpt_to_load_from',
'tensor_name_in_ckpt', 'max_norm', 'trainable'))):
'tensor_name_in_ckpt', 'max_norm', 'trainable',
'use_safe_embedding_lookup'))):
"""See `embedding_column`."""
@property
@ -2632,11 +2671,17 @@ class _SharedEmbeddingColumn(
self.tensor_name_in_ckpt: to_restore
})
sparse_id_rank = tensor_shape.dimension_value(
sparse_ids.dense_shape.get_shape()[0])
embedding_lookup_sparse = embedding_ops.safe_embedding_lookup_sparse
if (not self.use_safe_embedding_lookup and sparse_id_rank is not None and
sparse_id_rank <= 2):
embedding_lookup_sparse = embedding_ops.embedding_lookup_sparse
# Return embedding lookup result.
return embedding_ops.safe_embedding_lookup_sparse(
embedding_weights=embedding_weights,
sparse_ids=sparse_ids,
sparse_weights=sparse_weights,
return embedding_lookup_sparse(
embedding_weights,
sparse_ids,
sparse_weights,
combiner=self.combiner,
name='%s_weights' % self.name,
max_norm=self.max_norm)

View File

@ -850,7 +850,8 @@ def embedding_column(categorical_column,
ckpt_to_load_from=None,
tensor_name_in_ckpt=None,
max_norm=None,
trainable=True):
trainable=True,
use_safe_embedding_lookup=True):
"""`DenseColumn` that converts from sparse, categorical input.
Use this when your inputs are sparse, but you want to convert them to a dense
@ -911,6 +912,13 @@ def embedding_column(categorical_column,
`None`.
max_norm: If not `None`, embedding values are l2-normalized to this value.
trainable: Whether or not the embedding is trainable. Default is True.
use_safe_embedding_lookup: If true, uses safe_embedding_lookup_sparse
instead of embedding_lookup_sparse. safe_embedding_lookup_sparse ensures
there are no empty rows and all weights and ids are positive at the
expense of extra compute cost. This only applies to rank 2 (NxM) shaped
input tensors. Defaults to true, consider turning off if the above checks
are not needed. Note that having empty rows will not trigger any error
though the output result might be 0 or omitted.
Returns:
`DenseColumn` that converts from sparse input.
@ -944,7 +952,8 @@ def embedding_column(categorical_column,
ckpt_to_load_from=ckpt_to_load_from,
tensor_name_in_ckpt=tensor_name_in_ckpt,
max_norm=max_norm,
trainable=trainable)
trainable=trainable,
use_safe_embedding_lookup=use_safe_embedding_lookup)
@tf_export(v1=['feature_column.shared_embedding_columns'])
@ -956,7 +965,8 @@ def shared_embedding_columns(categorical_columns,
ckpt_to_load_from=None,
tensor_name_in_ckpt=None,
max_norm=None,
trainable=True):
trainable=True,
use_safe_embedding_lookup=True):
"""List of dense columns that convert from sparse, categorical input.
This is similar to `embedding_column`, except that it produces a list of
@ -1039,6 +1049,13 @@ def shared_embedding_columns(categorical_columns,
max_norm: If not `None`, each embedding is clipped if its l2-norm is larger
than this value, before combining.
trainable: Whether or not the embedding is trainable. Default is True.
use_safe_embedding_lookup: If true, uses safe_embedding_lookup_sparse
instead of embedding_lookup_sparse. safe_embedding_lookup_sparse ensures
there are no empty rows and all weights and ids are positive at the
expense of extra compute cost. This only applies to rank 2 (NxM) shaped
input tensors. Defaults to true, consider turning off if the above checks
are not needed. Note that having empty rows will not trigger any error
though the output result might be 0 or omitted.
Returns:
A list of dense columns that converts from sparse input. The order of
@ -1117,7 +1134,8 @@ def shared_embedding_columns(categorical_columns,
ckpt_to_load_from=ckpt_to_load_from,
tensor_name_in_ckpt=tensor_name_in_ckpt,
max_norm=max_norm,
trainable=trainable))
trainable=trainable,
use_safe_embedding_lookup=use_safe_embedding_lookup))
return result
@ -1131,7 +1149,8 @@ def shared_embedding_columns_v2(categorical_columns,
ckpt_to_load_from=None,
tensor_name_in_ckpt=None,
max_norm=None,
trainable=True):
trainable=True,
use_safe_embedding_lookup=True):
"""List of dense columns that convert from sparse, categorical input.
This is similar to `embedding_column`, except that it produces a list of
@ -1213,6 +1232,13 @@ def shared_embedding_columns_v2(categorical_columns,
max_norm: If not `None`, each embedding is clipped if its l2-norm is
larger than this value, before combining.
trainable: Whether or not the embedding is trainable. Default is True.
use_safe_embedding_lookup: If true, uses safe_embedding_lookup_sparse
instead of embedding_lookup_sparse. safe_embedding_lookup_sparse ensures
there are no empty rows and all weights and ids are positive at the
expense of extra compute cost. This only applies to rank 2 (NxM) shaped
input tensors. Defaults to true, consider turning off if the above checks
are not needed. Note that having empty rows will not trigger any error
though the output result might be 0 or omitted.
Returns:
A list of dense columns that converts from sparse input. The order of
@ -1277,7 +1303,8 @@ def shared_embedding_columns_v2(categorical_columns,
column_creator = SharedEmbeddingColumnCreator(
dimension, initializer, ckpt_to_load_from, tensor_name_in_ckpt,
num_buckets, trainable, shared_embedding_collection_name)
num_buckets, trainable, shared_embedding_collection_name,
use_safe_embedding_lookup)
result = []
for column in categorical_columns:
@ -3082,9 +3109,32 @@ class EmbeddingColumn(
collections.namedtuple(
'EmbeddingColumn',
('categorical_column', 'dimension', 'combiner', 'initializer',
'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable'))):
'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable',
'use_safe_embedding_lookup'))):
"""See `embedding_column`."""
def __new__(cls,
categorical_column,
dimension,
combiner,
initializer,
ckpt_to_load_from,
tensor_name_in_ckpt,
max_norm,
trainable,
use_safe_embedding_lookup=True):
return super(EmbeddingColumn, cls).__new__(
cls,
categorical_column=categorical_column,
dimension=dimension,
combiner=combiner,
initializer=initializer,
ckpt_to_load_from=ckpt_to_load_from,
tensor_name_in_ckpt=tensor_name_in_ckpt,
max_norm=max_norm,
trainable=trainable,
use_safe_embedding_lookup=use_safe_embedding_lookup)
@property
def _is_v2_column(self):
return (isinstance(self.categorical_column, FeatureColumn) and
@ -3156,11 +3206,17 @@ class EmbeddingColumn(
self.tensor_name_in_ckpt: to_restore
})
sparse_id_rank = tensor_shape.dimension_value(
sparse_ids.dense_shape.get_shape()[0])
embedding_lookup_sparse = embedding_ops.safe_embedding_lookup_sparse
if (not self.use_safe_embedding_lookup and sparse_id_rank is not None and
sparse_id_rank <= 2):
embedding_lookup_sparse = embedding_ops.embedding_lookup_sparse
# Return embedding lookup result.
return embedding_ops.safe_embedding_lookup_sparse(
embedding_weights=embedding_weights,
sparse_ids=sparse_ids,
sparse_weights=sparse_weights,
return embedding_lookup_sparse(
embedding_weights,
sparse_ids,
sparse_weights,
combiner=self.combiner,
name='%s_weights' % self.name,
max_norm=self.max_norm)
@ -3301,6 +3357,8 @@ class EmbeddingColumn(
@classmethod
def from_config(cls, config, custom_objects=None, columns_by_name=None):
"""See 'FeatureColumn` base class."""
if 'use_safe_embedding_lookup' not in config:
config['use_safe_embedding_lookup'] = True
from tensorflow.python.feature_column.serialization import deserialize_feature_column # pylint: disable=g-import-not-at-top
_check_config_keys(config, cls._fields)
kwargs = _standardize_and_copy_config(config)
@ -3326,7 +3384,8 @@ class SharedEmbeddingColumnCreator(tracking.AutoTrackable):
tensor_name_in_ckpt,
num_buckets,
trainable,
name='shared_embedding_column_creator'):
name='shared_embedding_column_creator',
use_safe_embedding_lookup=True):
self._dimension = dimension
self._initializer = initializer
self._ckpt_to_load_from = ckpt_to_load_from
@ -3334,11 +3393,13 @@ class SharedEmbeddingColumnCreator(tracking.AutoTrackable):
self._num_buckets = num_buckets
self._trainable = trainable
self._name = name
self._use_safe_embedding_lookup = use_safe_embedding_lookup
# Map from graph keys to embedding_weight variables.
self._embedding_weights = {}
def __call__(self, categorical_column, combiner, max_norm):
return SharedEmbeddingColumn(categorical_column, self, combiner, max_norm)
return SharedEmbeddingColumn(categorical_column, self, combiner, max_norm,
self._use_safe_embedding_lookup)
@property
def embedding_weights(self):
@ -3374,9 +3435,23 @@ class SharedEmbeddingColumn(
collections.namedtuple(
'SharedEmbeddingColumn',
('categorical_column', 'shared_embedding_column_creator', 'combiner',
'max_norm'))):
'max_norm', 'use_safe_embedding_lookup'))):
"""See `embedding_column`."""
def __new__(cls,
categorical_column,
shared_embedding_column_creator,
combiner,
max_norm,
use_safe_embedding_lookup=True):
return super(SharedEmbeddingColumn, cls).__new__(
cls,
categorical_column=categorical_column,
shared_embedding_column_creator=shared_embedding_column_creator,
combiner=combiner,
max_norm=max_norm,
use_safe_embedding_lookup=use_safe_embedding_lookup)
@property
def _is_v2_column(self):
return True
@ -3426,11 +3501,17 @@ class SharedEmbeddingColumn(
embedding_weights = self.shared_embedding_column_creator.embedding_weights
sparse_id_rank = tensor_shape.dimension_value(
sparse_ids.dense_shape.get_shape()[0])
embedding_lookup_sparse = embedding_ops.safe_embedding_lookup_sparse
if (not self.use_safe_embedding_lookup and sparse_id_rank is not None and
sparse_id_rank <= 2):
embedding_lookup_sparse = (embedding_ops.embedding_lookup_sparse)
# Return embedding lookup result.
return embedding_ops.safe_embedding_lookup_sparse(
embedding_weights=embedding_weights,
sparse_ids=sparse_ids,
sparse_weights=sparse_weights,
return embedding_lookup_sparse(
embedding_weights,
sparse_ids,
sparse_weights,
combiner=self.combiner,
name='%s_weights' % self.name,
max_norm=self.max_norm)

View File

@ -21,6 +21,7 @@ from __future__ import print_function
import collections
import copy
from absl.testing import parameterized
import numpy as np
from tensorflow.core.example import example_pb2
@ -5704,7 +5705,7 @@ class _TestStateManager(fc.StateManager):
raise ValueError('Could not find variable.')
class EmbeddingColumnTest(test.TestCase):
class EmbeddingColumnTest(test.TestCase, parameterized.TestCase):
@test_util.run_deprecated_v1
def test_defaults(self):
@ -6272,8 +6273,16 @@ class EmbeddingColumnTest(test.TestCase):
self.assertAllClose(((94.,), (29.,), (0.,), (42.,)),
self.evaluate(predictions))
@parameterized.named_parameters(
{
'testcase_name': 'use_safe_embedding_lookup',
'use_safe_embedding_lookup': True
}, {
'testcase_name': 'dont_use_safe_embedding_lookup',
'use_safe_embedding_lookup': False
})
@test_util.run_deprecated_v1
def test_dense_features(self):
def test_dense_features(self, use_safe_embedding_lookup):
# Inputs.
vocabulary_size = 3
sparse_input = sparse_tensor.SparseTensorValue(
@ -6317,7 +6326,8 @@ class EmbeddingColumnTest(test.TestCase):
embedding_column = fc.embedding_column(
categorical_column,
dimension=embedding_dimension,
initializer=_initializer)
initializer=_initializer,
use_safe_embedding_lookup=use_safe_embedding_lookup)
# Provide sparse input and get dense result.
l = df.DenseFeatures((embedding_column,))
@ -6339,6 +6349,14 @@ class EmbeddingColumnTest(test.TestCase):
self.assertAllEqual(embedding_values, self.evaluate(trainable_vars[0]))
self.assertAllEqual(expected_lookups, self.evaluate(dense_features))
if use_safe_embedding_lookup:
self.assertIn('SparseFillEmptyRows',
[x.type for x in ops.get_default_graph().get_operations()])
else:
self.assertNotIn(
'SparseFillEmptyRows',
[x.type for x in ops.get_default_graph().get_operations()])
@test_util.run_deprecated_v1
def test_dense_features_not_trainable(self):
# Inputs.
@ -6646,31 +6664,33 @@ class EmbeddingColumnTest(test.TestCase):
self.assertEqual([categorical_column], embedding_column.parents)
config = embedding_column.get_config()
self.assertEqual({
'categorical_column': {
'class_name': 'IdentityCategoricalColumn',
'config': {
'number_buckets': 3,
'key': 'aaa',
'default_value': None
}
},
'ckpt_to_load_from': None,
'combiner': 'mean',
'dimension': 2,
'initializer': {
'class_name': 'TruncatedNormal',
'config': {
'dtype': 'float32',
'stddev': 0.7071067811865475,
'seed': None,
'mean': 0.0
}
},
'max_norm': None,
'tensor_name_in_ckpt': None,
'trainable': True
}, config)
self.assertEqual(
{
'categorical_column': {
'class_name': 'IdentityCategoricalColumn',
'config': {
'number_buckets': 3,
'key': 'aaa',
'default_value': None
}
},
'ckpt_to_load_from': None,
'combiner': 'mean',
'dimension': 2,
'initializer': {
'class_name': 'TruncatedNormal',
'config': {
'dtype': 'float32',
'stddev': 0.7071067811865475,
'seed': None,
'mean': 0.0
}
},
'max_norm': None,
'tensor_name_in_ckpt': None,
'trainable': True,
'use_safe_embedding_lookup': True
}, config)
custom_objects = {'TruncatedNormal': init_ops.TruncatedNormal}
new_embedding_column = fc.EmbeddingColumn.from_config(
@ -6707,28 +6727,33 @@ class EmbeddingColumnTest(test.TestCase):
self.assertEqual([categorical_column], embedding_column.parents)
config = embedding_column.get_config()
self.assertEqual({
'categorical_column': {
'class_name': 'IdentityCategoricalColumn',
'config': {
'number_buckets': 3,
'key': 'aaa',
'default_value': None
}
},
'ckpt_to_load_from': None,
'combiner': 'mean',
'dimension': 2,
'initializer': '_initializer',
'max_norm': None,
'tensor_name_in_ckpt': None,
'trainable': True
}, config)
self.assertEqual(
{
'categorical_column': {
'class_name': 'IdentityCategoricalColumn',
'config': {
'number_buckets': 3,
'key': 'aaa',
'default_value': None
}
},
'ckpt_to_load_from': None,
'combiner': 'mean',
'dimension': 2,
'initializer': '_initializer',
'max_norm': None,
'tensor_name_in_ckpt': None,
'trainable': True,
'use_safe_embedding_lookup': True
}, config)
custom_objects = {
'_initializer': _initializer,
}
# use_safe_embedding_lookup might not be populated for legacy reasons.
del config['use_safe_embedding_lookup']
new_embedding_column = fc.EmbeddingColumn.from_config(
config, custom_objects=custom_objects)
self.assertEqual(embedding_column, new_embedding_column)
@ -6746,7 +6771,7 @@ class EmbeddingColumnTest(test.TestCase):
self.assertIs(categorical_column, new_embedding_column.categorical_column)
class SharedEmbeddingColumnTest(test.TestCase):
class SharedEmbeddingColumnTest(test.TestCase, parameterized.TestCase):
@test_util.run_deprecated_v1
def test_defaults(self):
@ -6952,8 +6977,16 @@ class SharedEmbeddingColumnTest(test.TestCase):
_assert_sparse_tensor_value(self, self.evaluate(output_b),
self.evaluate(output_b_embedded))
@parameterized.named_parameters(
{
'testcase_name': 'use_safe_embedding_lookup',
'use_safe_embedding_lookup': True
}, {
'testcase_name': 'dont_use_safe_embedding_lookup',
'use_safe_embedding_lookup': False
})
@test_util.run_deprecated_v1
def test_get_dense_tensor(self):
def test_get_dense_tensor(self, use_safe_embedding_lookup):
# Inputs.
vocabulary_size = 3
# -1 values are ignored.
@ -6988,12 +7021,18 @@ class SharedEmbeddingColumnTest(test.TestCase):
# example 1:
(2., 3.5), # ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
)
expected_lookups_b = (
# example 0:
(1., 2.), # ids [0], embedding = [1, 2]
# example 1:
(0., 0.), # ids [], embedding = [0, 0]
)
if use_safe_embedding_lookup:
expected_lookups_b = (
# example 0:
(1., 2.), # ids [0], embedding = [1, 2]
# example 1:
(0., 0.), # ids [], embedding = [0, 0]
)
else:
expected_lookups_b = (
# example 0:
(1., 2.), # ids [0], embedding = [1, 2]
)
# Build columns.
categorical_column_a = fc.categorical_column_with_identity(
@ -7003,7 +7042,8 @@ class SharedEmbeddingColumnTest(test.TestCase):
embedding_column_a, embedding_column_b = fc.shared_embedding_columns_v2(
[categorical_column_a, categorical_column_b],
dimension=embedding_dimension,
initializer=_initializer)
initializer=_initializer,
use_safe_embedding_lookup=use_safe_embedding_lookup)
# Provide sparse input and get dense result.
embedding_lookup_a = embedding_column_a.get_dense_tensor(
@ -7024,8 +7064,112 @@ class SharedEmbeddingColumnTest(test.TestCase):
self.assertAllEqual(expected_lookups_a, self.evaluate(embedding_lookup_a))
self.assertAllEqual(expected_lookups_b, self.evaluate(embedding_lookup_b))
if use_safe_embedding_lookup:
self.assertIn('SparseFillEmptyRows',
[x.type for x in ops.get_default_graph().get_operations()])
else:
self.assertNotIn(
'SparseFillEmptyRows',
[x.type for x in ops.get_default_graph().get_operations()])
@parameterized.named_parameters(
{
'testcase_name': 'use_safe_embedding_lookup',
'use_safe_embedding_lookup': True
}, {
'testcase_name': 'dont_use_safe_embedding_lookup',
'use_safe_embedding_lookup': False
})
@test_util.run_deprecated_v1
def test_get_dense_tensor_placeholder_inputs(self):
def test_get_dense_tensor_valid(self, use_safe_embedding_lookup):
# Inputs.
vocabulary_size = 3
# -1 values are ignored.
input_a = np.array([
[2, 1], # example 0, ids [2, 1]
[0, -1]
]) # example 1, ids [0]
input_b = np.array([
[1, -1], # example 0, ids [1]
[1, 2]
]) # example 1, ids [1, 2]
input_features = {'aaa': input_a, 'bbb': input_b}
# Embedding variable.
embedding_dimension = 2
embedding_values = (
(1., 2.), # id 0
(3., 5.), # id 1
(7., 11.) # id 2
)
def _initializer(shape, dtype, partition_info=None):
self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
self.assertEqual(dtypes.float32, dtype)
self.assertIsNone(partition_info)
return embedding_values
# Expected lookup result, using combiner='mean'.
expected_lookups_a = (
# example 0:
(5., 8.), # ids [2, 1], embedding = mean([3, 5] + [7, 11]) = [5, 8]
# example 1:
(1., 2), # ids [0], embedding = [1, 2]
)
expected_lookups_b = (
# example 0:
(3., 5.), # ids [1], embedding = [3, 5]
# example 1:
(5., 8.), # ids [1, 2], embedding = mean([3, 5] + [7, 11]) = [5, 8]
)
# Build columns.
categorical_column_a = fc.categorical_column_with_identity(
key='aaa', num_buckets=vocabulary_size)
categorical_column_b = fc.categorical_column_with_identity(
key='bbb', num_buckets=vocabulary_size)
embedding_column_a, embedding_column_b = fc.shared_embedding_columns_v2(
[categorical_column_a, categorical_column_b],
dimension=embedding_dimension,
initializer=_initializer,
use_safe_embedding_lookup=use_safe_embedding_lookup)
# Provide sparse input and get dense result.
embedding_lookup_a = embedding_column_a.get_dense_tensor(
fc.FeatureTransformationCache(input_features), None)
embedding_lookup_b = embedding_column_b.get_dense_tensor(
fc.FeatureTransformationCache(input_features), None)
# Assert expected embedding variable and lookups.
global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
self.assertCountEqual(('aaa_bbb_shared_embedding:0',),
tuple([v.name for v in global_vars]))
embedding_var = global_vars[0]
self.evaluate(variables_lib.global_variables_initializer())
self.evaluate(lookup_ops.tables_initializer())
self.assertAllEqual(embedding_values, self.evaluate(embedding_var))
self.assertAllEqual(expected_lookups_a, self.evaluate(embedding_lookup_a))
self.assertAllEqual(expected_lookups_b, self.evaluate(embedding_lookup_b))
if use_safe_embedding_lookup:
self.assertIn('SparseFillEmptyRows',
[x.type for x in ops.get_default_graph().get_operations()])
else:
self.assertNotIn(
'SparseFillEmptyRows',
[x.type for x in ops.get_default_graph().get_operations()])
@parameterized.named_parameters(
{
'testcase_name': 'use_safe_embedding_lookup',
'use_safe_embedding_lookup': True
}, {
'testcase_name': 'dont_use_safe_embedding_lookup',
'use_safe_embedding_lookup': False
})
@test_util.run_deprecated_v1
def test_get_dense_tensor_placeholder_inputs(self, use_safe_embedding_lookup):
# Inputs.
vocabulary_size = 3
# -1 values are ignored.
@ -7073,13 +7217,21 @@ class SharedEmbeddingColumnTest(test.TestCase):
embedding_column_a, embedding_column_b = fc.shared_embedding_columns_v2(
[categorical_column_a, categorical_column_b],
dimension=embedding_dimension,
initializer=_initializer)
initializer=_initializer,
use_safe_embedding_lookup=use_safe_embedding_lookup)
# Provide sparse input and get dense result.
embedding_lookup_a = embedding_column_a.get_dense_tensor(
fc.FeatureTransformationCache(input_features), None)
embedding_lookup_b = embedding_column_b.get_dense_tensor(
fc.FeatureTransformationCache(input_features), None)
if use_safe_embedding_lookup:
self.assertIn('SparseFillEmptyRows',
[x.type for x in ops.get_default_graph().get_operations()])
else:
self.assertNotIn(
'SparseFillEmptyRows',
[x.type for x in ops.get_default_graph().get_operations()])
with _initialized_session() as sess:
sess.run([embedding_lookup_a, embedding_lookup_b], feed_dict=feed_dict)

View File

@ -57,7 +57,8 @@ def embedding_column(categorical_column,
combiner='mean',
initializer=None,
max_sequence_length=0,
learning_rate_fn=None):
learning_rate_fn=None,
use_safe_embedding_lookup=True):
"""TPU embedding_column for `tf.feature_column.embedding_column`.
Note that the interface for TPU embedding_column is different from the non-TPU
@ -86,6 +87,13 @@ def embedding_column(categorical_column,
sequence features and 0 for non-sequence features.
learning_rate_fn: A function that takes global step and returns learning
rate for the embedding table.
use_safe_embedding_lookup: If true, uses safe_embedding_lookup_sparse
instead of embedding_lookup_sparse. safe_embedding_lookup_sparse ensures
there are no empty rows and all weights and ids are positive at the
expense of extra compute cost. This only applies to rank 2 (NxM) shaped
input tensors. Defaults to true, consider turning off if the above checks
are not needed. Note that having empty rows will not trigger any error
though the output result might be 0 or omitted.
Returns:
A _TPUEmbeddingColumn.
@ -137,7 +145,8 @@ def embedding_column(categorical_column,
max_norm=None,
trainable=True,
max_sequence_length=max_sequence_length,
learning_rate_fn=learning_rate_fn)
learning_rate_fn=learning_rate_fn,
use_safe_embedding_lookup=use_safe_embedding_lookup)
# For Embedding column, the initializer is hidden inside the creator Fn, which
# is not accessible later. So, we attach it to a special field. Also note
# that non-TPU Embedding column and non-TPU shared Embedding column handle the
@ -152,7 +161,8 @@ def shared_embedding_columns(categorical_columns,
initializer=None,
shared_embedding_collection_name=None,
max_sequence_lengths=None,
learning_rate_fn=None):
learning_rate_fn=None,
use_safe_embedding_lookup=True):
"""List of dense columns that convert from sparse, categorical input.
Note that the interface for TPU embedding_column is different from the non-TPU
@ -187,6 +197,13 @@ def shared_embedding_columns(categorical_columns,
sequence longer will be truncated.
learning_rate_fn: A function that takes global step and returns learning
rate for the embedding table.
use_safe_embedding_lookup: If true, uses safe_embedding_lookup_sparse
instead of embedding_lookup_sparse. safe_embedding_lookup_sparse ensures
there are no empty rows and all weights and ids are positive at the
expense of extra compute cost. This only applies to rank 2 (NxM) shaped
input tensors. Defaults to true, consider turning off if the above checks
are not needed. Note that having empty rows will not trigger any error
though the output result might be 0 or omitted.
Returns:
A _TPUEmbeddingColumn.
@ -261,7 +278,8 @@ def shared_embedding_columns(categorical_columns,
max_norm=None,
trainable=True,
max_sequence_length=max_sequence_length,
learning_rate_fn=learning_rate_fn)
learning_rate_fn=learning_rate_fn,
use_safe_embedding_lookup=use_safe_embedding_lookup)
tpu_columns.append(column)
return tpu_columns
@ -347,7 +365,8 @@ class _TPUEmbeddingColumn(_TPUBaseEmbeddingColumn, fc._EmbeddingColumn):
max_norm=None,
trainable=True,
max_sequence_length=0,
learning_rate_fn=None):
learning_rate_fn=None,
use_safe_embedding_lookup=True):
# Note, args ckpt_to_load_from, tensor_name_in_ckpt, max_norm and trainable
# are not supported on TPU. They are solely for matching the signature of
# __new__ of parent class fc._EmbeddingColumn.
@ -360,7 +379,8 @@ class _TPUEmbeddingColumn(_TPUBaseEmbeddingColumn, fc._EmbeddingColumn):
ckpt_to_load_from=ckpt_to_load_from,
tensor_name_in_ckpt=tensor_name_in_ckpt,
max_norm=max_norm,
trainable=trainable)
trainable=trainable,
use_safe_embedding_lookup=use_safe_embedding_lookup)
def __init__(self,
categorical_column,
@ -372,7 +392,8 @@ class _TPUEmbeddingColumn(_TPUBaseEmbeddingColumn, fc._EmbeddingColumn):
max_norm=None,
trainable=True,
max_sequence_length=0,
learning_rate_fn=None):
learning_rate_fn=None,
use_safe_embedding_lookup=True):
_TPUBaseEmbeddingColumn.__init__(
self,
categorical_column,
@ -479,7 +500,8 @@ class _TPUSharedEmbeddingColumn(_TPUBaseEmbeddingColumn,
max_norm=None,
trainable=True,
max_sequence_length=0,
learning_rate_fn=None):
learning_rate_fn=None,
use_safe_embedding_lookup=True):
return fc._SharedEmbeddingColumn.__new__(
cls,
categorical_column,
@ -490,7 +512,8 @@ class _TPUSharedEmbeddingColumn(_TPUBaseEmbeddingColumn,
ckpt_to_load_from=ckpt_to_load_from,
tensor_name_in_ckpt=tensor_name_in_ckpt,
max_norm=max_norm,
trainable=trainable)
trainable=trainable,
use_safe_embedding_lookup=use_safe_embedding_lookup)
def __init__(self,
categorical_column,
@ -503,7 +526,8 @@ class _TPUSharedEmbeddingColumn(_TPUBaseEmbeddingColumn,
max_norm=None,
trainable=True,
max_sequence_length=0,
learning_rate_fn=None):
learning_rate_fn=None,
use_safe_embedding_lookup=True):
_TPUBaseEmbeddingColumn.__init__(
self,

View File

@ -56,7 +56,8 @@ def embedding_column_v2(categorical_column,
max_sequence_length=0,
learning_rate_fn=None,
embedding_lookup_device=None,
tensor_core_shape=None):
tensor_core_shape=None,
use_safe_embedding_lookup=True):
"""TPU version of `tf.compat.v1.feature_column.embedding_column`.
Note that the interface for `tf.tpu.experimental.embedding_column` is
@ -122,6 +123,13 @@ def embedding_column_v2(categorical_column,
the intended dense shape to run embedding lookup for this feature on
TensorCore. The batch dimension can be left None or -1 to indicate
a dynamic shape. Only rank 2 shapes currently supported.
use_safe_embedding_lookup: If true, uses safe_embedding_lookup_sparse
instead of embedding_lookup_sparse. safe_embedding_lookup_sparse ensures
there are no empty rows and all weights and ids are positive at the
expense of extra compute cost. This only applies to rank 2 (NxM) shaped
input tensors. Defaults to true, consider turning off if the above checks
are not needed. Note that having empty rows will not trigger any error
though the output result might be 0 or omitted.
Returns:
A `_TPUEmbeddingColumnV2`.
@ -175,7 +183,8 @@ def embedding_column_v2(categorical_column,
combiner=combiner,
initializer=initializer,
max_sequence_length=max_sequence_length,
learning_rate_fn=learning_rate_fn)
learning_rate_fn=learning_rate_fn,
use_safe_embedding_lookup=use_safe_embedding_lookup)
else:
return _TPUDeviceSpecificEmbeddingColumnV2(
categorical_column=categorical_column,
@ -185,7 +194,8 @@ def embedding_column_v2(categorical_column,
max_sequence_length=max_sequence_length,
learning_rate_fn=learning_rate_fn,
embedding_lookup_device=embedding_lookup_device,
tensor_core_shape=tensor_core_shape)
tensor_core_shape=tensor_core_shape,
use_safe_embedding_lookup=use_safe_embedding_lookup)
@tf_export(v1=['tpu.experimental.shared_embedding_columns'])
@ -197,7 +207,8 @@ def shared_embedding_columns_v2(categorical_columns,
max_sequence_lengths=None,
learning_rate_fn=None,
embedding_lookup_device=None,
tensor_core_shape=None):
tensor_core_shape=None,
use_safe_embedding_lookup=True):
"""TPU version of `tf.compat.v1.feature_column.shared_embedding_columns`.
Note that the interface for `tf.tpu.experimental.shared_embedding_columns` is
@ -271,6 +282,13 @@ def shared_embedding_columns_v2(categorical_columns,
intended dense shape to run embedding lookup for this feature on
TensorCore. The batch dimension can be left None or -1 to indicate a
dynamic shape. Only rank 2 shapes currently supported.
use_safe_embedding_lookup: If true, uses safe_embedding_lookup_sparse
instead of embedding_lookup_sparse. safe_embedding_lookup_sparse ensures
there are no empty rows and all weights and ids are positive at the
expense of extra compute cost. This only applies to rank 2 (NxM) shaped
input tensors. Defaults to true, consider turning off if the above checks
are not needed. Note that having empty rows will not trigger any error
though the output result might be 0 or omitted.
Returns:
A list of `_TPUSharedEmbeddingColumnV2`.
@ -364,7 +382,8 @@ def shared_embedding_columns_v2(categorical_columns,
initializer=initializer,
shared_embedding_collection_name=shared_embedding_collection_name,
max_sequence_length=max_sequence_length,
learning_rate_fn=learning_rate_fn)
learning_rate_fn=learning_rate_fn,
use_safe_embedding_lookup=use_safe_embedding_lookup)
else:
column = _TPUSharedDeviceSpecificEmbeddingColumnV2(
categorical_column=categorical_column,
@ -375,7 +394,8 @@ def shared_embedding_columns_v2(categorical_columns,
max_sequence_length=max_sequence_length,
learning_rate_fn=learning_rate_fn,
embedding_lookup_device=embedding_lookup_device,
tensor_core_shape=tensor_core_shape)
tensor_core_shape=tensor_core_shape,
use_safe_embedding_lookup=use_safe_embedding_lookup)
tpu_columns.append(column)
return tpu_columns
@ -390,7 +410,8 @@ class _TPUEmbeddingColumnV2(_TPUBaseEmbeddingColumn, fc_lib.EmbeddingColumn):
combiner='mean',
initializer=None,
max_sequence_length=0,
learning_rate_fn=None):
learning_rate_fn=None,
use_safe_embedding_lookup=True):
return fc_lib.EmbeddingColumn.__new__(
cls,
categorical_column,
@ -400,7 +421,8 @@ class _TPUEmbeddingColumnV2(_TPUBaseEmbeddingColumn, fc_lib.EmbeddingColumn):
ckpt_to_load_from=None,
tensor_name_in_ckpt=None,
max_norm=None,
trainable=True)
trainable=True,
use_safe_embedding_lookup=use_safe_embedding_lookup)
def __getnewargs__(self):
return (self._tpu_categorical_column, self.dimension, self.combiner,
@ -416,7 +438,8 @@ class _TPUEmbeddingColumnV2(_TPUBaseEmbeddingColumn, fc_lib.EmbeddingColumn):
combiner='mean',
initializer=None,
max_sequence_length=0,
learning_rate_fn=None):
learning_rate_fn=None,
use_safe_embedding_lookup=True):
_TPUBaseEmbeddingColumn.__init__(
self,
categorical_column,
@ -573,13 +596,15 @@ class _TPUSharedEmbeddingColumnV2(_TPUBaseEmbeddingColumn,
initializer=None,
shared_embedding_collection_name=None,
max_sequence_length=0,
learning_rate_fn=None):
learning_rate_fn=None,
use_safe_embedding_lookup=True):
return fc_lib.SharedEmbeddingColumn.__new__(
cls,
categorical_column,
combiner=combiner,
shared_embedding_column_creator=shared_embedding_column_creator,
max_norm=None)
max_norm=None,
use_safe_embedding_lookup=use_safe_embedding_lookup)
def __getnewargs__(self):
return (self._tpu_categorical_column, self.shared_embedding_column_creator,
@ -598,7 +623,8 @@ class _TPUSharedEmbeddingColumnV2(_TPUBaseEmbeddingColumn,
initializer=None,
shared_embedding_collection_name=None,
max_sequence_length=0,
learning_rate_fn=None):
learning_rate_fn=None,
use_safe_embedding_lookup=True):
_TPUBaseEmbeddingColumn.__init__(
self,

View File

@ -43,7 +43,7 @@ def _initialized_session():
return sess
class EmbeddingColumnTestV2(test.TestCase):
class EmbeddingColumnTestV2(test.TestCase, parameterized.TestCase):
def test_defaults(self):
categorical_column = fc_lib.categorical_column_with_identity(
@ -77,8 +77,16 @@ class EmbeddingColumnTestV2(test.TestCase):
'aaa': parsing_ops.VarLenFeature(dtypes.int64)
}, embedding_column._parse_example_spec)
@parameterized.named_parameters(
{
'testcase_name': 'use_safe_embedding_lookup',
'use_safe_embedding_lookup': True,
}, {
'testcase_name': 'dont_use_safe_embedding_lookup',
'use_safe_embedding_lookup': False,
})
@test_util.deprecated_graph_mode_only
def test_feature_layer_cpu(self):
def test_feature_layer_cpu(self, use_safe_embedding_lookup):
# Inputs.
vocabulary_size = 3
sparse_input = sparse_tensor.SparseTensorValue(
@ -135,12 +143,14 @@ class EmbeddingColumnTestV2(test.TestCase):
embedding_column = tpu_fc.embedding_column_v2(
categorical_column,
dimension=embedding_dimension,
initializer=_initializer)
initializer=_initializer,
use_safe_embedding_lookup=use_safe_embedding_lookup)
sequence_embedding_column = tpu_fc.embedding_column_v2(
sequence_categorical_column,
dimension=embedding_dimension,
initializer=_initializer,
max_sequence_length=2)
max_sequence_length=2,
use_safe_embedding_lookup=use_safe_embedding_lookup)
# Provide sparse input and get dense result.
features = {'aaa': sparse_input, 'bbb': sparse_input}
@ -160,6 +170,16 @@ class EmbeddingColumnTestV2(test.TestCase):
self.assertAllEqual(expected_lookups, embedding_lookup.eval())
self.assertAllEqual(expected_lookups_sequence,
sequence_embedding_lookup[0].eval())
# The graph will still have SparseFillEmptyRows due to sequence being
# a Rank3 embedding lookup.
if use_safe_embedding_lookup:
self.assertEqual(2, [
x.type for x in ops.get_default_graph().get_operations()
].count('SparseFillEmptyRows'))
else:
self.assertEqual(1, [
x.type for x in ops.get_default_graph().get_operations()
].count('SparseFillEmptyRows'))
def test_deepcopy(self):
categorical_column = fc_lib.categorical_column_with_identity(
@ -173,7 +193,7 @@ class EmbeddingColumnTestV2(test.TestCase):
embedding_column_copy._max_sequence_length)
class SharedEmbeddingColumnTestV2(test.TestCase):
class SharedEmbeddingColumnTestV2(test.TestCase, parameterized.TestCase):
@test_util.deprecated_graph_mode_only
def test_defaults(self):
@ -238,8 +258,16 @@ class SharedEmbeddingColumnTestV2(test.TestCase):
self.assertEqual((embedding_dimension,), embedding_column_a.variable_shape)
self.assertEqual((embedding_dimension,), embedding_column_b.variable_shape)
@parameterized.named_parameters(
{
'testcase_name': 'use_safe_embedding_lookup',
'use_safe_embedding_lookup': True
}, {
'testcase_name': 'dont_use_safe_embedding_lookup',
'use_safe_embedding_lookup': False
})
@test_util.deprecated_graph_mode_only
def test_feature_layer_cpu(self):
def test_feature_layer_cpu(self, use_safe_embedding_lookup):
# Inputs.
vocabulary_size = 3
input_a = sparse_tensor.SparseTensorValue(
@ -296,7 +324,8 @@ class SharedEmbeddingColumnTestV2(test.TestCase):
[categorical_column_a, categorical_column_b],
dimension=embedding_dimension,
initializer=_initializer,
max_sequence_lengths=[0, 2])
max_sequence_lengths=[0, 2],
use_safe_embedding_lookup=use_safe_embedding_lookup)
# Provide sparse input and get dense result.
dense_features = fc_lib.DenseFeatures([embedding_column_a])
@ -315,6 +344,16 @@ class SharedEmbeddingColumnTestV2(test.TestCase):
self.assertAllEqual(expected_lookups_a, embedding_lookup_a.eval())
self.assertAllEqual(expected_lookups_b,
embedding_lookup_b[0].eval())
# The graph will still have SparseFillEmptyRows due to sequence being
# a Rank3 embedding lookup.
if use_safe_embedding_lookup:
self.assertEqual(2, [
x.type for x in ops.get_default_graph().get_operations()
].count('SparseFillEmptyRows'))
else:
self.assertEqual(1, [
x.type for x in ops.get_default_graph().get_operations()
].count('SparseFillEmptyRows'))
def test_deepcopy(self):
vocabulary_size = 3

View File

@ -26,7 +26,7 @@ tf_module {
}
member_method {
name: "embedding_column"
argspec: "args=[\'categorical_column\', \'dimension\', \'combiner\', \'initializer\', \'ckpt_to_load_from\', \'tensor_name_in_ckpt\', \'max_norm\', \'trainable\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\', \'None\', \'None\', \'None\', \'True\'], "
argspec: "args=[\'categorical_column\', \'dimension\', \'combiner\', \'initializer\', \'ckpt_to_load_from\', \'tensor_name_in_ckpt\', \'max_norm\', \'trainable\', \'use_safe_embedding_lookup\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\', \'None\', \'None\', \'None\', \'True\', \'True\'], "
}
member_method {
name: "indicator_column"
@ -70,7 +70,7 @@ tf_module {
}
member_method {
name: "shared_embedding_columns"
argspec: "args=[\'categorical_columns\', \'dimension\', \'combiner\', \'initializer\', \'shared_embedding_collection_name\', \'ckpt_to_load_from\', \'tensor_name_in_ckpt\', \'max_norm\', \'trainable\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\'], "
argspec: "args=[\'categorical_columns\', \'dimension\', \'combiner\', \'initializer\', \'shared_embedding_collection_name\', \'ckpt_to_load_from\', \'tensor_name_in_ckpt\', \'max_norm\', \'trainable\', \'use_safe_embedding_lookup\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'True\'], "
}
member_method {
name: "weighted_categorical_column"

View File

@ -22,7 +22,7 @@ tf_module {
}
member_method {
name: "embedding_column"
argspec: "args=[\'categorical_column\', \'dimension\', \'combiner\', \'initializer\', \'max_sequence_length\', \'learning_rate_fn\', \'embedding_lookup_device\', \'tensor_core_shape\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\', \'0\', \'None\', \'None\', \'None\'], "
argspec: "args=[\'categorical_column\', \'dimension\', \'combiner\', \'initializer\', \'max_sequence_length\', \'learning_rate_fn\', \'embedding_lookup_device\', \'tensor_core_shape\', \'use_safe_embedding_lookup\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\', \'0\', \'None\', \'None\', \'None\', \'True\'], "
}
member_method {
name: "initialize_tpu_system"
@ -30,7 +30,7 @@ tf_module {
}
member_method {
name: "shared_embedding_columns"
argspec: "args=[\'categorical_columns\', \'dimension\', \'combiner\', \'initializer\', \'shared_embedding_collection_name\', \'max_sequence_lengths\', \'learning_rate_fn\', \'embedding_lookup_device\', \'tensor_core_shape\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
argspec: "args=[\'categorical_columns\', \'dimension\', \'combiner\', \'initializer\', \'shared_embedding_collection_name\', \'max_sequence_lengths\', \'learning_rate_fn\', \'embedding_lookup_device\', \'tensor_core_shape\', \'use_safe_embedding_lookup\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\'], "
}
member_method {
name: "shutdown_tpu_system"

View File

@ -26,7 +26,7 @@ tf_module {
}
member_method {
name: "embedding_column"
argspec: "args=[\'categorical_column\', \'dimension\', \'combiner\', \'initializer\', \'ckpt_to_load_from\', \'tensor_name_in_ckpt\', \'max_norm\', \'trainable\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\', \'None\', \'None\', \'None\', \'True\'], "
argspec: "args=[\'categorical_column\', \'dimension\', \'combiner\', \'initializer\', \'ckpt_to_load_from\', \'tensor_name_in_ckpt\', \'max_norm\', \'trainable\', \'use_safe_embedding_lookup\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\', \'None\', \'None\', \'None\', \'True\', \'True\'], "
}
member_method {
name: "indicator_column"
@ -62,7 +62,7 @@ tf_module {
}
member_method {
name: "shared_embeddings"
argspec: "args=[\'categorical_columns\', \'dimension\', \'combiner\', \'initializer\', \'shared_embedding_collection_name\', \'ckpt_to_load_from\', \'tensor_name_in_ckpt\', \'max_norm\', \'trainable\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\'], "
argspec: "args=[\'categorical_columns\', \'dimension\', \'combiner\', \'initializer\', \'shared_embedding_collection_name\', \'ckpt_to_load_from\', \'tensor_name_in_ckpt\', \'max_norm\', \'trainable\', \'use_safe_embedding_lookup\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'True\'], "
}
member_method {
name: "weighted_categorical_column"