# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ===================================================================
"""Optional helper for gradient handling."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections

from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import variables
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.tpu.ops import tpu_ops


def get_gradients_through_compute_gradients(optimizer, loss, activations):
  """Compute gradients to send to TPU embedding.

  Args:
    optimizer: a subclass of optimizer.Optimizer, usually CrossShardOptimizer.
      Used to call compute_gradients().
    loss: a Tensor to call optimizer.compute_gradients() on.
    activations: an OrderedDict mapping feature_name to Tensors of activations.

  Returns:
    An OrderedDict mapping from feature name Strings to Tensors of gradients of
      the loss wrt the activations of the features.
  """
  activation_list = activations.values()
  grads_and_vars = optimizer.compute_gradients(loss, activation_list)
  grads = [grad for grad, _ in grads_and_vars]
  feature_to_gradient_dict = collections.OrderedDict(
      zip(activations.keys(), grads))
  return feature_to_gradient_dict
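

# A minimal illustrative sketch (hypothetical, not part of the original
# module): one way a TPU training loop might call the helper above.
# `cross_shard_optimizer`, `loss`, and `activations` are assumed inputs: a
# CrossShardOptimizer, a scalar loss Tensor built on the activations, and the
# OrderedDict of embedding activations, respectively.
def _example_gradients_via_compute_gradients(cross_shard_optimizer, loss,
                                             activations):
  """Hypothetical caller of get_gradients_through_compute_gradients()."""
  feature_to_gradient = get_gradients_through_compute_gradients(
      cross_shard_optimizer, loss, activations)
  # Each value is d(loss)/d(activation) for the corresponding feature, in the
  # same order as `activations`; this is what is sent back to TPU embedding.
  return feature_to_gradient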


def create_dummy_table_variables(tpu_embedding):
  """Create dummy embedding table variables.

  The sole purpose of these dummy variables is to trigger gradient
  calculation wrt them so that the gradients wrt the activations can be
  captured and later sent to TPU embedding.

  Args:
    tpu_embedding: TPUEmbedding, dummy table variables will be created for use
      with tpu_embedding.

  Returns:
    A tuple of dummy variables and their initializer.

  Raises:
    RuntimeError: if the collection to store gradients already exists and is
      not empty.
  """
  dummy_table_variables = collections.OrderedDict()
  for table_id, table in enumerate(tpu_embedding.table_to_features_dict):
    dummy_table_variables[table] = (
        # Explicitly specifying collections prevents this variable from
        # being added to the GLOBAL_VARIABLES collection, so that Saver()
        # ignores it.
        # However, the TensorFlow optimizer still creates slot variables for
        # these dummy variables, e.g.
        # tpu_embedding_dummy_table_variable_mlp_user/Adam{_1}, which will be
        # in the GLOBAL_VARIABLES collection.
        variable_scope.get_variable(
            'tpu_embedding_dummy_table_variable_{}'.format(table),
            dtype=dtypes.float32,
            shape=[1],
            use_resource=True,
            trainable=True,
            collections=['tpu_embedding_dummy_table_variables']))

    g = ops.get_default_graph()
    table_gradients = g.get_collection_ref(
        'tpu_embedding_gradients_table_{}'.format(table_id))
    if table_gradients:
      raise RuntimeError(
          'tpu_embedding_gradients_table_{} is not empty.'.format(table_id))
    table_gradients.extend(
        [None] * len(tpu_embedding.table_to_features_dict[table]))

  return (dummy_table_variables,
          variables.variables_initializer(
              dummy_table_variables.values(),
              name='tpu_embedding_dummy_table_variables_init'))
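

# A minimal illustrative sketch (hypothetical, not part of the original
# module): creating the dummy tables and their initializer. `tpu_embedding` is
# assumed to be an already configured TPUEmbedding object.
def _example_create_dummy_tables(tpu_embedding):
  """Hypothetical caller of create_dummy_table_variables()."""
  dummy_tables, dummy_tables_init = create_dummy_table_variables(tpu_embedding)
  # The returned initializer must be run before training starts; the dummy
  # variables are kept out of GLOBAL_VARIABLES, so global_variables_initializer()
  # alone will not initialize them. The call above also creates one (still
  # empty) 'tpu_embedding_gradients_table_<id>' collection per embedding table.
  return dummy_tables, dummy_tables_init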


def hook_dummy_table_variables_to_activations(tpu_embedding, activations,
                                              dummy_table_variables):
  """Have activations depend on dummy table variables for gradient intercept.

  Args:
    tpu_embedding: TPUEmbedding, activations and dummy_table_variables are from
      tpu_embedding.
    activations: An OrderedDict of feature name String to activation tensors.
    dummy_table_variables: An OrderedDict of table name String to dummy table
      variables.

  Returns:
    An OrderedDict of feature name String to activation tensors, which can be
      used just as the activations input.
  """
  new_activations = collections.OrderedDict()
  for feature in activations:
    table = tpu_embedding.feature_to_config_dict[feature].table_id
    new_activations[feature] = tpu_ops.tpu_embedding_activations(
        dummy_table_variables[table],
        activations[feature],
        table_id=list(tpu_embedding.table_to_config_dict).index(table),
        lookup_id=tpu_embedding.table_to_features_dict[table].index(feature))
  return new_activations
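

# A minimal illustrative sketch (hypothetical, not part of the original
# module): rewiring activations through the dummy tables. `tpu_embedding`,
# `activations`, and `dummy_tables` are assumed to come from TPUEmbedding and
# create_dummy_table_variables() above.
def _example_hook_activations(tpu_embedding, activations, dummy_tables):
  """Hypothetical caller of hook_dummy_table_variables_to_activations()."""
  hooked = hook_dummy_table_variables_to_activations(
      tpu_embedding, activations, dummy_tables)
  # The model should consume `hooked` instead of `activations`; only then does
  # backpropagation flow through the tpu_embedding_activations ops so that the
  # per-feature gradients can be captured.
  return hooked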


def get_gradients_through_dummy_table_variables(tpu_embedding):
  """Get gradients wrt the activations of each feature.

  Args:
    tpu_embedding: TPUEmbedding, whose activation gradients (captured via the
      dummy table variables) are read from the graph's gradient collections.

  Returns:
    An OrderedDict mapping feature name to gradient.

  Raises:
    ValueError: if some gradients are not defined.
  """
  g = ops.get_default_graph()
  feature_to_gradient_dict = collections.OrderedDict()
  for table_id, table in enumerate(tpu_embedding.table_to_config_dict):
    table_gradients = g.get_collection(
        'tpu_embedding_gradients_table_{}'.format(table_id))
    if all(gradient is None for gradient in table_gradients):
      raise ValueError(
          'Table {} with id {} has undefined gradients: this is probably '
          'because the model asked TPUEmbedding to compute activations that '
          'were not used.'.format(table, table_id))
    if any(gradient is None for gradient in table_gradients):
      # TODO(bfontain): create a white-list for optimizers which are compatible
      # with `tf.stop_gradient`.
      logging.warn(
          'Table {} with id {} has undefined gradients: this is probably '
          'because the model asked TPUEmbedding to compute activations that '
          'were not used, or tf.stop_gradient() is applied. Gradients of zeros '
          'are sent back to TPUEmbedding instead. Gradients of zeros and no '
          'gradients are equivalent for SGD, AdaGrad, FTRL, momentum, etc., '
          'but might differ for other optimizers due to the implementation of '
          'TPU embedding optimizers.'.format(table, table_id))

    for feature, gradient in zip(tpu_embedding.table_to_features_dict[table],
                                 table_gradients):
      if gradient is not None:
        feature_to_gradient_dict[feature] = gradient
      else:
        # Substitute a zero gradient of the right shape when no gradient was
        # captured for this feature.
        dimension = tpu_embedding.table_to_config_dict[table].dimension
        batch_size = tpu_embedding.batch_size_per_core
        max_sequence_length = (
            tpu_embedding.feature_to_config_dict[feature].max_sequence_length)
        if max_sequence_length:
          feature_to_gradient_dict[feature] = array_ops.zeros(
              [batch_size, max_sequence_length, dimension])
        else:
          feature_to_gradient_dict[feature] = array_ops.zeros(
              [batch_size, dimension])

  return feature_to_gradient_dict
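

# A minimal illustrative sketch (hypothetical, not part of the original
# module): reading back the captured gradients once the training op has been
# built. `tpu_embedding`, `optimizer`, and `loss` are assumed inputs, with
# `loss` computed from the hooked activations returned above.
def _example_fetch_captured_gradients(tpu_embedding, optimizer, loss):
  """Hypothetical caller of get_gradients_through_dummy_table_variables()."""
  # Building the training op differentiates through the hooked activations,
  # which is what populates the 'tpu_embedding_gradients_table_<id>'
  # collections read below.
  train_op = optimizer.minimize(loss)
  feature_to_gradient = get_gradients_through_dummy_table_variables(
      tpu_embedding)
  # `feature_to_gradient` (with zeros filled in for unused features) is then
  # typically handed to the TPUEmbedding send-gradients path alongside
  # `train_op`.
  return train_op, feature_to_gradient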