From 0f8a32c6cc4213f2c7a1a755b5a95f236bfb3eed Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Tue, 15 Dec 2020 20:09:11 -0800
Subject: [PATCH] Fix gradient computation bug in TPU embedding mid-level API.

PiperOrigin-RevId: 347744033
Change-Id: I685274217865fca9d5aa0f34bbbce618dcac5f13
---
 tensorflow/python/tpu/ops/tpu_ops.py      | 77 ++++++++-----
 .../python/tpu/tpu_embedding_gradient.py  |  4 +-
 2 files changed, 36 insertions(+), 45 deletions(-)

diff --git a/tensorflow/python/tpu/ops/tpu_ops.py b/tensorflow/python/tpu/ops/tpu_ops.py
index 4ddf259ff1a..8facb1fdad7 100644
--- a/tensorflow/python/tpu/ops/tpu_ops.py
+++ b/tensorflow/python/tpu/ops/tpu_ops.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # =============================================================================
+
 """Operations for TPUs."""
 
 from __future__ import absolute_import
@@ -56,8 +57,8 @@ def all_to_all(x,
     split_count: The number of splits, this number must equal to the sub-group
       size(group_assignment.get_shape()[1])
     group_assignment: Optional 2d int32 lists with shape [num_groups,
-      num_replicas_per_group]. `group_assignment[i]` represents the replica ids
-      in the ith subgroup.
+      num_replicas_per_group]. `group_assignment[i]` represents the replica
+      ids in the ith subgroup.
     name: Optional op name.
 
   Returns:
@@ -96,8 +97,8 @@ def cross_replica_sum(x, group_assignment=None, name=None):
   Args:
     x: The local tensor to the sum.
     group_assignment: Optional 2d int32 lists with shape [num_groups,
-      num_replicas_per_group]. `group_assignment[i]` represents the replica ids
-      in the ith subgroup.
+      num_replicas_per_group]. `group_assignment[i]` represents the replica
+      ids in the ith subgroup.
     name: Optional op name.
 
   Returns:
@@ -167,8 +168,8 @@ def _embedding_activations_grad(activations_op, grad_wrt_activations):
   g = ops.get_default_graph()
   table_id = activations_op.get_attr("table_id")
   lookup_id = activations_op.get_attr("lookup_id")
-  table_gradients = g.get_collection_ref("tpu_embedding_gradients_table_%d" %
-                                         table_id)
+  table_gradients = g.get_collection_ref(
+      "tpu_embedding_gradients_table_%d" % table_id)
 
   if not table_gradients:
     raise RuntimeError(
@@ -180,15 +181,6 @@ def _embedding_activations_grad(activations_op, grad_wrt_activations):
         " train_op = opt.minimize(loss)\n"
        "\n")
 
-  if table_gradients[lookup_id] is not None:
-    raise RuntimeError(
-        "Duplicate gradients (w.r.t. TPUEmbedding activations) generated for "
-        "table_id {} and lookup_id {}. This happens when there are multiple "
-        "calls to tf.gradients in a graph containing TPU embeddings. "
-        "TF cannot identify which gradient to use for updating the embedding "
-        "variables. Consider placing tf.StopGradient around tensors where "
-        "variable update is not required.".format(table_id, lookup_id))
-
   table_gradients[lookup_id] = array_ops.identity(grad_wrt_activations)
   return [
       # RegisterGradient requires that value be returned for all inputs. Since
@@ -230,10 +222,10 @@ def infeed_dequeue_tuple(dtypes, shapes, name=None):
   """A placeholder op for values fed into the TPU simultaneously as a tuple.
 
   Args:
-    dtypes: A list of `tf.DType`s that has length `>= 1`. The element types of
-      each element in `outputs`.
-    shapes: A list of shapes (each a `tf.TensorShape` or list of `ints`). The
-      shapes of each tensor in `outputs`.
+    dtypes: A list of `tf.DType`s that has length `>= 1`.
+      The element types of each element in `outputs`.
+    shapes: A list of shapes (each a `tf.TensorShape` or list of `ints`).
+      The shapes of each tensor in `outputs`.
     name: A name for the operation (optional).
 
   Returns:
@@ -249,8 +241,6 @@ def infeed_dequeue_tuple(dtypes, shapes, name=None):
           "{} is not a supported TPU infeed type. Supported types are: "
           "{}".format(dtype, list(_SUPPORTED_INFEED_DTYPES)))
   return gen_tpu_ops.infeed_dequeue_tuple(dtypes, shapes, name=name)
-
-
 # pylint: enable=redefined-outer-name
 
 
@@ -263,18 +253,19 @@ def send_tpu_embedding_gradients(inputs,
 
   Args:
     inputs: A TensorList of gradients with which to update embedding tables.
-      This argument has the same length and shapes as the return value of
-      RecvTPUEmbeddingActivations, but contains gradients of the model's loss
-      with respect to the embedding activations. The embedding tables are
-      updated from these gradients via the optimizers specified in the TPU
-      embedding configuration given to tpu.initialize_system.
+      This argument has the same length and shapes as the return value of
+      RecvTPUEmbeddingActivations, but contains gradients of the model's
+      loss with respect to the embedding activations. The embedding tables
+      are updated from these gradients via the optimizers specified in the
+      TPU embedding configuration given to tpu.initialize_system.
     config: Serialized TPUEmbeddingConfiguration proto.
     learning_rates: A TensorList of float32 scalars, one for each dynamic
       learning rate tag: see the comments in
-      //third_party/tensorflow/core/protobuf/tpu/
-      optimization_parameters.proto. Multiple tables can share the same
-      dynamic learning rate tag as specified in the configuration. If the
-      learning rates for all tables are constant, this list should be empty.
+      //third_party/tensorflow/core/protobuf/tpu/
+      optimization_parameters.proto.
+      Multiple tables can share the same dynamic learning rate tag as
+      specified in the configuration. If the learning rates for all tables
+      are constant, this list should be empty.
     name: A name for the operation (optional).
 
   Returns:
@@ -336,8 +327,8 @@ def enqueue_tpu_embedding_sparse_batch(sample_indices,
   """A placeholder op for enqueueing embedding IDs to the TPU.
 
   Args:
-    sample_indices: A list of rank 1 Tensors specifying the training example and
-      feature to which the corresponding embedding_indices and
+    sample_indices: A list of rank 1 Tensors specifying the training example
+      and feature to which the corresponding embedding_indices and
       aggregation_weights values belong. sample_indices[i] must equal b * nf +
       f, where nf is the number of features from the corresponding table, f is
       in [0, nf), and b is in [0, batch size). Both int32 and int64 are allowed,
@@ -345,9 +336,9 @@ def enqueue_tpu_embedding_sparse_batch(sample_indices,
     embedding_indices: A list of rank 1 Tensors, indices into the embedding
       tables. Both int32 and int64 are allowed and will be converted to int32
       internally.
-    aggregation_weights: A list of rank 1 Tensors containing per sample -- i.e.,
-      per (training example, feature) -- aggregation weights. Both float32 and
-      float64 are allowed and will be converted to float32 internally.
+    aggregation_weights: A list of rank 1 Tensors containing per sample --
+      i.e. per (training example, feature) -- aggregation weights. Both float32
+      and float64 are allowed and will be converted to float32 internally.
     device_ordinal: The TPU device to use. Should be >= 0 and less than the
       number of TPU cores in the task on which the node is placed.
     combiners: A list of string scalars, one for each embedding table that
@@ -395,20 +386,20 @@ def enqueue_tpu_embedding_sparse_tensor_batch(sample_indices,
   """A placeholder op for enqueueing embedding IDs to the TPU.
 
   Args:
-    sample_indices: A list of rank 2 Tensors specifying the training example to
-      which the corresponding embedding_indices and aggregation_weights values
-      belong. It corresponds to sp_ids.indices in embedding_lookup_sparse(). If
-      the size of its first dimension is 0, we assume each embedding_indices
-      belongs to a different sample. Both int32 and int64 are allowed and will
-      be converted to int32 internally.
+    sample_indices: A list of rank 2 Tensors specifying the training example
+      to which the corresponding embedding_indices and aggregation_weights
+      values belong. It corresponds to sp_ids.indices in
+      embedding_lookup_sparse(). If the size of its first dimension is 0, we
+      assume each embedding_indices belongs to a different sample. Both int32
+      and int64 are allowed and will be converted to int32 internally.
     embedding_indices: A list of rank 1 Tensors, indices into the embedding
       tables. It corresponds to sp_ids.values in embedding_lookup_sparse(). Both
       int32 and int64 are allowed and will be converted to int32 internally.
     aggregation_weights: A list of rank 1 Tensors containing per training
       example aggregation weights. It corresponds to sp_weights.values in
       embedding_lookup_sparse(). If the size of its first dimension is 0, we
-      assume all weights are 1. Both float32 and float64 are allowed and will be
-      converted to float32 internally.
+      assume all weights are 1. Both float32 and float64 are allowed and will
+      be converted to float32 internally.
     table_ids: A list of integers specifying the identifier of the embedding
       table (offset of TableDescriptor in the TPUEmbeddingConfiguration) to
       lookup the corresponding input. The ith input is looked up using
diff --git a/tensorflow/python/tpu/tpu_embedding_gradient.py b/tensorflow/python/tpu/tpu_embedding_gradient.py
index 23ecf3eb006..ec727b8f141 100644
--- a/tensorflow/python/tpu/tpu_embedding_gradient.py
+++ b/tensorflow/python/tpu/tpu_embedding_gradient.py
@@ -91,8 +91,8 @@ def create_dummy_table_variables(tpu_embedding):
     if table_gradients:
       raise RuntimeError(
          'tpu_embedding_gradients_table_{} is not empty.'.format(table_id))
-    num_features = len(tpu_embedding.table_to_features_dict[table])
-    table_gradients.extend([None for _ in range(num_features)])
+    table_gradients.extend(
+        [None] * len(tpu_embedding.table_to_features_dict[table]))
 
   return (dummy_table_variables,
           variables.variables_initializer(
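
----8<---- illustrative sketch, not part of the patch ----8<----

For context on the mechanism this patch changes: _embedding_activations_grad
never applies an update itself. It stashes the incoming gradient in the graph
collection "tpu_embedding_gradients_table_<id>", whose None slots are
pre-sized by create_dummy_table_variables, and the mid-level API later reads
those slots to build the send op. With the duplicate-gradient check deleted,
a second tf.gradients call now overwrites the stored tensor instead of
raising. The standalone Python sketch below mimics that plumbing under stated
assumptions: it uses TF1-style graph mode via tf.compat.v1, and _stash_grad
plus the one-slot collection sizing are hypothetical stand-ins, not the
actual TensorFlow embedding ops.

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

TABLE_ID = 0
COLLECTION = "tpu_embedding_gradients_table_%d" % TABLE_ID


@tf.custom_gradient
def _stash_grad(activations):  # hypothetical stand-in for the embedding op
  """Identity in the forward pass; stashes its gradient in a collection."""

  def grad(dy):
    table_gradients = tf.get_default_graph().get_collection_ref(COLLECTION)
    # Mirrors the patched _embedding_activations_grad: the slot is simply
    # (over)written, so a repeated tf.gradients call replaces the old value.
    table_gradients[0] = tf.identity(dy)
    return dy

  return tf.identity(activations), grad


graph = tf.get_default_graph()
# Pre-size the collection with one None slot per feature, as
# create_dummy_table_variables does for each table.
graph.get_collection_ref(COLLECTION).extend([None] * 1)

embedding_output = tf.constant([[1.0, 2.0]])
activations = _stash_grad(embedding_output)
loss = tf.reduce_sum(activations)
# Building the gradient graph populates the collection as a side effect.
tf.gradients(loss, embedding_output)

with tf.Session() as sess:
  print(sess.run(graph.get_collection_ref(COLLECTION)[0]))  # -> [[1. 1.]]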