Splits backprop.py in two files, one of which can be converted to C

PiperOrigin-RevId: 171165855
2017-10-05 09:50:49 -07:00 · 2017-10-05 09:50:49 -07:00 · 5f97262ae6
commit 5f97262ae6
parent 7e7d55c0f5
6 changed files with 335 additions and 286 deletions
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@ -339,7 +339,9 @@ py_library(
    srcs_version = "PY2AND3",
    visibility = ["//tensorflow:internal"],
    deps = [
        ":imperative_grad",
        "//tensorflow/python:array_ops",
        "//tensorflow/python:constant_op",
        "//tensorflow/python:dtypes",
        "//tensorflow/python:errors",
        "//tensorflow/python:framework_ops",
@ -425,3 +427,9 @@ filegroup(
    ),
    visibility = ["//tensorflow:__subpackages__"],
 )
 py_library(
    name = "imperative_grad",
    srcs = ["imperative_grad.py"],
    deps = [":tape"],
 )
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@ -18,7 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 import collections
 import functools
 import operator
 import threading
@ -28,6 +27,7 @@ import six
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
 from tensorflow.python.eager import execute
 from tensorflow.python.eager import imperative_grad
 from tensorflow.python.eager import tape
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@ -36,288 +36,10 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.util import tf_contextlib
+from tensorflow.python.util import nest
 from tensorflow.python.util import tf_inspect
 # If over MIN_AGGREGATE_COUNT gradients are accumulated and the total
 # memory consumption is over MIN_AGGREGATE_BYTES, do an early aggregation
 # so as to release the gradient tensor to save memory.
 _MIN_AGGREGATE_COUNT = 4
 _MIN_AGGREGATE_BYTES = 128 * 1024 * 1024
 # Terminology:
 #
 #  - op: a possibly composite operation, which has an entry in the tape
 #  - target: dy in dx/dy
 #  - source: dx in dx/dy
 #  - tensor: one of the many inputs or outputs of an operation
 #
 # Below here we do the gradient algorithm. It works as follows:
 #
 # First we filter the tape to just the subset of operations we want to
 # differentiate. In the process of doing so we count how many times each Tensor
 # is used as an input to an op (so we know when we're done computing gradients
 # for that Tensor). We also count, for each tape entry, how many of its output
 # Tensors need gradients to be computed (Tensors which are not used do not need
 # any gradients to be computed).
 #
 # Finally, we start a backprop stack with a set of tape entries for which we
 # have all gradients available. This set usually is a subset of the set of
 # targets (not all since targets which have outputs in the tape will not have
 # gradients available initially).
 #
 # Then we repeatedly pop an entry from the stack, run its backprop, and update
 # the gradients of its inputs. Once we have computed all gradients for a single
 # input we can mark this input as done, and this can trigger adding an entry to
 # the stack if all outputs of that entry are now done.
 #
 # When the stack is empty we have gradients for all tensors we're interested in.
 def _prepare_backprop(target, tensor_to_op, op_to_entry, id_sources):
  """Filters the tape to only include relevant entries and counts tensor usages.
  Args:
    target: the target to optimize.
    tensor_to_op: Map from tensor id to key in op_to_entry that produced it.
    op_to_entry: Map from op id to a tape.TapeEntry object
    id_sources: the ids of the sources wrt the gradient is being taken.
  Returns:
    usage counts (how many entries downstream from a tensor use it)
    op_to_entry_map: entry map (a filtered tape, with only the relevant
     entries),
    missing: map from tensor id to how many downstream gradients still need
     to be computed before this tensor's gradient can be computed.
  """
  if isinstance(target, (ops.Tensor)):
    tensor_stack = [ops.tensor_id(target)]
  else:
    tensor_stack = list([ops.tensor_id(x) for x in target])
  tensor_usage_counts = {}
  o_to_e = {}  # Copy of just the bits we need from op_to_entry
  while tensor_stack:
    t = tensor_stack.pop()
    op = tensor_to_op.get(t, None)
    # op is None if the tensor is a source (i.e. was watched directly)
    if op is None or op in o_to_e:
      continue
    op_trace = op_to_entry[op]
    o_to_e[op] = op_trace
    for it in op_trace.input_ids:
      if it in tensor_usage_counts:
        tensor_usage_counts[it] += 1
      else:
        tensor_usage_counts[it] = 1
        if it not in id_sources and it in tensor_to_op:
          tensor_stack.append(it)
  op_missing_tensor_counts = collections.defaultdict(int)
  for t in tensor_usage_counts:
    if t in tensor_to_op and tensor_to_op[t] is not None:
      op_missing_tensor_counts[tensor_to_op[t]] += 1
  return tensor_usage_counts, o_to_e, op_missing_tensor_counts
 def _initialize_backprop_stack(op_to_entry, op_missing_tensor):
  """Returns the set of tape entries which are available for backprop."""
  ready_ops = []
  for op in op_to_entry:
    if op not in op_missing_tensor:
      ready_ops.append(op)
  return ready_ops
 def _initial_gradients(target, output_gradients, tensor_usage_counts):
  """Computes the initial gradients for each Tensor."""
  # Initialize the backprop stack
  gradients = collections.defaultdict(list)
  if isinstance(target, ops.Tensor):
    if output_gradients is not None:
      output_gradient = output_gradients
    else:
      output_gradient = array_ops.ones_like(target)
    gradients[ops.tensor_id(target)].append(output_gradient)
  else:
    for i, t in enumerate(target):
      if ops.tensor_id(t) in tensor_usage_counts:
        # Can't provide a gradient of something we're trying to differentiate
        assert output_gradients is None or output_gradients[i] is None
      else:
        if output_gradients is None or output_gradients[i] is None:
          out_grad = array_ops.ones_like(t)
        else:
          out_grad = output_gradients[i]
        gradients[ops.tensor_id(t)].append(out_grad)
  return gradients
@tf_contextlib.contextmanager
 def _no_op():
  yield
 def _aggregate_grads(gradients):
  """Aggregate gradients from multiple sources.
  Args:
    gradients: A list of 'Tensor' or 'IndexedSlices' gradients.
  Returns:
    If 'gradients' only has 'Tensor', returns an aggregated 'Tensor'.
    Otherwise returns an aggregated 'IndexedSlices'.
  """
  assert gradients, "No gradients to aggregate"
  if len(gradients) == 1:
    return gradients[0]
  if all([isinstance(g, ops.Tensor) for g in gradients]):
    return math_ops.add_n(gradients)
  else:
    assert all([isinstance(g, (ops.Tensor, ops.IndexedSlices))
                for g in gradients])
    indexed_slices_list = []
    for grad in gradients:
      # TODO(xpan): Support nested IndexedSlices and core IndexedSlices
      if isinstance(grad, ops.Tensor):
        indexed_slices = ops.IndexedSlices(
            grad,
            constant_op.constant(range(grad.shape[0])),
            constant_op.constant(grad.shape.as_list()))
        indexed_slices_list.append(indexed_slices)
      else:
        indexed_slices_list.append(grad)
    # Dense shapes from all gradients should be the same.
    dense_shape = indexed_slices_list[0].dense_shape
    # For simplicity now, always cast to int64.
    indices = array_ops.concat([math_ops.cast(x.indices, dtypes.int64)
                                for x in indexed_slices_list], 0)
    values = array_ops.concat([x.values for x in indexed_slices_list], 0)
    return ops.IndexedSlices(values, indices, dense_shape)
 def _add_new_grads(gradients, gradients_size, tid, grad):
  """Adds a new gradient and maybe aggregate the gradients.
  Args:
    gradients: A dict map from tensor id to list of gradients.
    gradients_size: A dict map from tensor id to its total units. Might
       not be initialized.
    tid: Tensor id.
    grad: New gradient for the `tid`, either a Tensor or IndexedSlices.
  Raises:
    ValueError: if `grad` is neight Tensor nor IndexedSlices.
  """
  tensor_grads = gradients[tid]
  tensor_grads.append(grad)
  if len(tensor_grads) < _MIN_AGGREGATE_COUNT:
    return
  elif tid not in gradients_size:
    if isinstance(grad, ops.Tensor):
      size = functools.reduce(operator.mul, grad._shape_tuple(), 1)  # pylint: disable=protected-access
    elif isinstance(grad, ops.IndexedSlices):
      size = functools.reduce(operator.mul, grad.values._shape_tuple(), 1)  # pylint: disable=protected-access
    else:
      raise ValueError("Unexpected gradient type: %s" % type(grad))
    gradients_size[tid] = size
  else:
    size = gradients_size[tid]
  # For simplicity, assume each element to be 4 bytes now.
  if len(tensor_grads) * size * 4 > _MIN_AGGREGATE_BYTES:
    gradients[tid] = [_aggregate_grads(tensor_grads)]
 def imperative_grad(
    target,
    sources,
    output_gradients=None):
  """Computes gradients from the imperatively defined tape on top of the stack.
  Works by filtering the tape, computing how many downstream usages are of each
  tensor and entry, and repeatedly applying backward functions until we have
  gradients for all sources.
  Args:
   target: either a Tensor or list of Tensors to be differentiated.
   sources: list of Tensors for which we want gradients
   output_gradients: if not None, a list of gradient provided for each Target,
    or None if we are to use the target's computed downstream gradient.
  Returns:
   the gradient wrt each of the sources.
  Raises:
    RuntimeError: if something goes wrong.
    ValueError: if there is no sequence of differentiable operations connecting
     a source and any target Tensor. This can happen either if the target is
     not computed based on the source, if the tracing was set up incorrectly,
     or if only non-differentiable functions of the source were used in the
     computation of target.
  """
  if not tape._tape_stack.stack:  # pylint: disable=protected-access
    raise RuntimeError("Computing a gradient with no tape present")
  bp_tape = tape.pop_tape()
  tensor_to_op, op_to_entry = bp_tape.export()
  # This overwrites the op_to_entry variable, which will release all memory used
  # to keep traces that are irrelevant to the gradient computation we're doing
  # here.
  id_sources = [ops.tensor_id(t) for t in sources]
  tensor_usage_counts, op_to_entry, op_missing_tensor = _prepare_backprop(
      target, tensor_to_op, op_to_entry, id_sources)
  ready_ops = _initialize_backprop_stack(op_to_entry, op_missing_tensor)
  gradients = _initial_gradients(target, output_gradients,
                                 tensor_usage_counts)
  gradients_size = dict()
  # Now exhaust the backprop stack
  while ready_ops:
    op = ready_ops.pop()
    op_trace = op_to_entry.pop(op)
    out_gradients = [gradients.pop(t, None) for t in op_trace.output_ids]
    for i in range(len(out_gradients)):
      if out_gradients[i] is None:
        # TODO(apassos) this should be in the right device
        none_indices = _grad_fn_accepts_none_for_indices.get(
            op_trace.op_type, None)
        if none_indices is None or i not in none_indices:
          out_gradients[i] = array_ops.zeros(
              *op_trace.output_shape_and_dtype[i])
      else:
        out_gradients[i] = _aggregate_grads(out_gradients[i])
    in_gradients = op_trace.backward_function(
        *(out_gradients + op_trace.side_outputs))
    in_gradients = ([in_gradients]
                    if isinstance(in_gradients, (ops.Tensor,
                                                 ops.IndexedSlices,
                                                 type(None)))
                    else in_gradients)
    for i, t in enumerate(op_trace.input_ids):
      if in_gradients[i] is not None:
        _add_new_grads(gradients, gradients_size, t, in_gradients[i])
      if tensor_usage_counts.get(t, 0) > 0:
        tensor_usage_counts[t] -= 1
        if (t in tensor_to_op
            and tensor_usage_counts[t] == 0
            and t not in id_sources):
          in_op = tensor_to_op[t]
          if in_op is None:
            continue
          if op_missing_tensor.get(in_op, 0) > 0:
            op_missing_tensor[in_op] -= 1
            if op_missing_tensor.get(in_op, 0) == 0:
              ready_ops.append(in_op)
  result = []
  for i, s in enumerate(sources):
    g = gradients.get(ops.tensor_id(s), None)
    if g is None:
      result.append(None)
    else:
      result.append(_aggregate_grads(g))
  return result
 _op_attr_type_cache = {}
@ -557,7 +279,7 @@ def _record_gradient(op_name, inputs, attrs, results, name):
    if _tracing:
      print("Gradient for", (name if name else op_name), "inputs", op_inputs,
            "output_grads", orig_outputs, "gradients", result)
-    return result
+    return nest.flatten(result)
  tape.record_operation(op_name, results, inputs, [], grad_fn)
  if _tracing:
@ -615,7 +337,9 @@ def implicit_val_and_grad(f):
    end_node = f(*args)
    variables = tape.top_tape_watched_variables()
    sources = [x.handle for x in variables]
-    grad = imperative_grad(end_node, sources)
+    grad = imperative_grad.imperative_grad(_default_vspace,
                                           nest.flatten(end_node),
                                           sources)
    return end_node, list(zip(grad, variables))
  return grad_fn
@ -849,6 +573,96 @@ def val_and_grad_function(f, params=None):
      sources.append(args[i])
      tape.watch(args[i])
    result = f(*args)
-    return result, imperative_grad(result, sources, output_gradients=dy)
+    return result, imperative_grad.imperative_grad(
        _default_vspace, nest.flatten(result), sources,
        output_gradients=nest.flatten(dy) if dy is not None else None)
  return decorated
 def _aggregate_grads(gradients):
  """Aggregate gradients from multiple sources.
  Args:
    gradients: A list of 'Tensor' or 'IndexedSlices' gradients.
  Returns:
    If 'gradients' only has 'Tensor', returns an aggregated 'Tensor'.
    Otherwise returns an aggregated 'IndexedSlices'.
  """
  assert gradients, "No gradients to aggregate"
  if len(gradients) == 1:
    return gradients[0]
  if all([isinstance(g, ops.Tensor) for g in gradients]):
    return math_ops.add_n(gradients)
  else:
    assert all([isinstance(g, (ops.Tensor, ops.IndexedSlices))
                for g in gradients])
    indexed_slices_list = []
    for grad in gradients:
      # TODO(xpan): Support nested IndexedSlices and core IndexedSlices
      if isinstance(grad, ops.Tensor):
        indexed_slices = ops.IndexedSlices(
            grad,
            constant_op.constant(range(grad.shape[0])),
            constant_op.constant(grad.shape.as_list()))
        indexed_slices_list.append(indexed_slices)
      else:
        indexed_slices_list.append(grad)
    # Dense shapes from all gradients should be the same.
    dense_shape = indexed_slices_list[0].dense_shape
    # For simplicity now, always cast to int64.
    indices = array_ops.concat([math_ops.cast(x.indices, dtypes.int64)
                                for x in indexed_slices_list], 0)
    values = array_ops.concat([x.values for x in indexed_slices_list], 0)
    return ops.IndexedSlices(values, indices, dense_shape)
 # If over MIN_AGGREGATE_COUNT gradients are accumulated and the total
 # memory consumption is over MIN_AGGREGATE_BYTES, do an early aggregation
 # so as to release the gradient tensor to save memory.
 _MIN_AGGREGATE_COUNT = 4
 _MIN_AGGREGATE_BYTES = 128 * 1024 * 1024
 def _add_new_grads(gradients, gradients_size, tid, grad):
  """Adds a new gradient and maybe aggregate the gradients.
  Args:
    gradients: A dict map from tensor id to list of gradients.
    gradients_size: A dict map from tensor id to its total units. Might
       not be initialized.
    tid: Tensor id.
    grad: New gradient for the `tid`, either a Tensor or IndexedSlices.
  Raises:
    ValueError: if `grad` is neight Tensor nor IndexedSlices.
  """
  tensor_grads = gradients[tid]
  tensor_grads.append(grad)
  if len(tensor_grads) < _MIN_AGGREGATE_COUNT:
    return
  elif tid not in gradients_size:
    if isinstance(grad, ops.Tensor):
      size = functools.reduce(operator.mul, grad._shape_tuple(), 1)  # pylint: disable=protected-access
    elif isinstance(grad, ops.IndexedSlices):
      size = functools.reduce(operator.mul, grad.values._shape_tuple(), 1)  # pylint: disable=protected-access
    else:
      raise ValueError("Unexpected gradient type: %s" % type(grad))
    gradients_size[tid] = size
  else:
    size = gradients_size[tid]
  # For simplicity, assume each element to be 4 bytes now.
  if len(tensor_grads) * size * 4 > _MIN_AGGREGATE_BYTES:
    gradients[tid] = [_aggregate_grads(tensor_grads)]
 _default_vspace = imperative_grad.VSpace(
    add_new_grads_fn=_add_new_grads,
    aggregate_fn=_aggregate_grads,
    tensor_id=ops.tensor_id,
    zeros=array_ops.zeros,
    ones_like=array_ops.ones_like)
--- a/tensorflow/python/eager/custom_gradient.py
+++ b/tensorflow/python/eager/custom_gradient.py
@ -78,7 +78,7 @@ def custom_gradient(f):
    # second derivative this way if they capture any output tensors. Change the
    # signature of custom_gradient.
    def actual_grad_fn(*outputs):
-      return grad_fn(*outputs)
+      return nest.flatten(grad_fn(*outputs))
    flat_result = nest.flatten(result)
    tape.record_operation(
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@ -88,7 +88,7 @@ def _convert_to_graph_tensor(value, dtype=None, name=None, as_ref=False):
  else:
    captured_value = captured_value[1]
  tape.record_operation("captured_value", [captured_value], [value], [],
-                        lambda x: x)
+                        lambda x: [x])
  return captured_value
--- a/tensorflow/python/eager/imperative_grad.py
+++ b/tensorflow/python/eager/imperative_grad.py
@ -0,0 +1,227 @@
 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 """Code for backpropagation using the tape utilities."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 import collections
 from tensorflow.python.eager import tape
 # Terminology:
 #
 #  - op: a possibly composite operation, which has an entry in the tape
 #  - target: dy in dx/dy
 #  - source: dx in dx/dy
 #  - tensor: one of the many inputs or outputs of an operation
 #
 # Below here we do the gradient algorithm. It works as follows:
 #
 # First we filter the tape to just the subset of operations we want to
 # differentiate. In the process of doing so we count how many times each Tensor
 # is used as an input to an op (so we know when we're done computing gradients
 # for that Tensor). We also count, for each tape entry, how many of its output
 # Tensors need gradients to be computed (Tensors which are not used do not need
 # any gradients to be computed).
 #
 # Finally, we start a backprop stack with a set of tape entries for which we
 # have all gradients available. This set usually is a subset of the set of
 # targets (not all since targets which have outputs in the tape will not have
 # gradients available initially).
 #
 # Then we repeatedly pop an entry from the stack, run its backprop, and update
 # the gradients of its inputs. Once we have computed all gradients for a single
 # input we can mark this input as done, and this can trigger adding an entry to
 # the stack if all outputs of that entry are now done.
 #
 # When the stack is empty we have gradients for all tensors we're interested in.
 def _prepare_backprop(vspace, target, tensor_to_op, op_to_entry, id_sources):
  """Filters the tape to only include relevant entries and counts tensor usages.
  Args:
    vspace: information about the space we're differentiating in.
    target: the target to optimize.
    tensor_to_op: Map from tensor id to key in op_to_entry that produced it.
    op_to_entry: Map from op id to a tape.TapeEntry object
    id_sources: the ids of the sources wrt the gradient is being taken.
  Returns:
    usage counts (how many entries downstream from a tensor use it)
    op_to_entry_map: entry map (a filtered tape, with only the relevant
     entries),
    missing: map from tensor id to how many downstream gradients still need
     to be computed before this tensor's gradient can be computed.
  """
  tensor_stack = [vspace.tensor_id(x) for x in target]
  tensor_usage_counts = {}
  o_to_e = {}  # Copy of just the bits we need from op_to_entry
  while tensor_stack:
    t = tensor_stack.pop()
    op = tensor_to_op.get(t, None)
    # op is None if the tensor is a source (i.e. was watched directly)
    if op is None or op in o_to_e:
      continue
    op_trace = op_to_entry[op]
    o_to_e[op] = op_trace
    for it in op_trace.input_ids:
      if it in tensor_usage_counts:
        tensor_usage_counts[it] += 1
      else:
        tensor_usage_counts[it] = 1
        if it not in id_sources and it in tensor_to_op:
          tensor_stack.append(it)
  op_missing_tensor_counts = collections.defaultdict(int)
  for t in tensor_usage_counts:
    if t in tensor_to_op and tensor_to_op[t] is not None:
      op_missing_tensor_counts[tensor_to_op[t]] += 1
  return tensor_usage_counts, o_to_e, op_missing_tensor_counts
 def _initialize_backprop_stack(op_to_entry, op_missing_tensor):
  """Returns the set of tape entries which are available for backprop."""
  ready_ops = []
  for op in op_to_entry:
    if op not in op_missing_tensor:
      ready_ops.append(op)
  return ready_ops
 def _initial_gradients(vspace, target, output_gradients, tensor_usage_counts):
  """Computes the initial gradients for each Tensor."""
  # Initialize the backprop stack
  gradients = collections.defaultdict(list)
  for i, t in enumerate(target):
    if vspace.tensor_id(t) in tensor_usage_counts:
      # Can't provide a gradient of something we're trying to differentiate
      assert output_gradients is None or output_gradients[i] is None
    else:
      if output_gradients is None or output_gradients[i] is None:
        out_grad = vspace.ones_like(t)
      else:
        out_grad = output_gradients[i]
      gradients[vspace.tensor_id(t)].append(out_grad)
  return gradients
 VSpace = collections.namedtuple(
    "VSpace",
    ["add_new_grads_fn", "aggregate_fn", "tensor_id", "zeros", "ones_like"])
 def imperative_grad(
    vspace,
    target,
    sources,
    output_gradients=None):
  """Computes gradients from the imperatively defined tape on top of the stack.
  Works by filtering the tape, computing how many downstream usages are of each
  tensor and entry, and repeatedly applying backward functions until we have
  gradients for all sources.
  Args:
   vspace: the vector space in which to differentiate.
   target: either a Tensor or list of Tensors to be differentiated.
   sources: list of Tensors for which we want gradients
   output_gradients: if not None, a list of gradient provided for each Target,
    or None if we are to use the target's computed downstream gradient.
  Returns:
   the gradient wrt each of the sources.
  Raises:
    RuntimeError: if something goes wrong.
    ValueError: if there is no sequence of differentiable operations connecting
     a source and any target Tensor. This can happen either if the target is
     not computed based on the source, if the tracing was set up incorrectly,
     or if only non-differentiable functions of the source were used in the
     computation of target.
  """
  if not tape._tape_stack.stack:  # pylint: disable=protected-access
    raise RuntimeError("Computing a gradient with no tape present")
  bp_tape = tape.pop_tape()
  tensor_to_op, op_to_entry = bp_tape.export()
  # This overwrites the op_to_entry variable, which will release all memory used
  # to keep traces that are irrelevant to the gradient computation we're doing
  # here.
  id_sources = [vspace.tensor_id(t) for t in sources]
  tensor_usage_counts, op_to_entry, op_missing_tensor = _prepare_backprop(
      vspace, target, tensor_to_op, op_to_entry, id_sources)
  ready_ops = _initialize_backprop_stack(op_to_entry, op_missing_tensor)
  gradients = _initial_gradients(vspace, target, output_gradients,
                                 tensor_usage_counts)
  gradients_size = dict()
  # Now exhaust the backprop stack
  while ready_ops:
    op = ready_ops.pop()
    op_trace = op_to_entry.pop(op)
    out_gradients = [gradients.pop(t, None) for t in op_trace.output_ids]
    for i in range(len(out_gradients)):
      if out_gradients[i] is None:
        # TODO(apassos) this should be in the right device
        none_indices = _grad_fn_accepts_none_for_indices.get(
            op_trace.op_type, None)
        if none_indices is None or i not in none_indices:
          out_gradients[i] = vspace.zeros(
              *op_trace.output_shape_and_dtype[i])
      else:
        out_gradients[i] = vspace.aggregate_fn(out_gradients[i])
    in_gradients = op_trace.backward_function(
        *(out_gradients + op_trace.side_outputs))
    for i, t in enumerate(op_trace.input_ids):
      if in_gradients[i] is not None:
        vspace.add_new_grads_fn(gradients, gradients_size, t, in_gradients[i])
      if tensor_usage_counts.get(t, 0) > 0:
        tensor_usage_counts[t] -= 1
        if (t in tensor_to_op
            and tensor_usage_counts[t] == 0
            and t not in id_sources):
          in_op = tensor_to_op[t]
          if in_op is None:
            continue
          if op_missing_tensor.get(in_op, 0) > 0:
            op_missing_tensor[in_op] -= 1
            if op_missing_tensor.get(in_op, 0) == 0:
              ready_ops.append(in_op)
  result = []
  for i, s in enumerate(sources):
    g = gradients.get(vspace.tensor_id(s), None)
    if g is None:
      result.append(None)
    else:
      result.append(vspace.aggregate_fn(g))
  return result
 # TODO(agarwal): use an automatic mechanism for handling None arguments to
 # gradient functions.
 # Some gradient functions can accept None arguments for gradients. The following
 # maps the operation name to the indices at which the corresponding gradient
 # function can accept None values.
 # e.g. FusedBatchNorm outputs 5 values and hence receives 5 gradient values
 # during backprop. However the gradient function uses only the first of those
 # values and ignores the rest. The entry, "FusedBatchNorm": [1, 2, 3, 4],
 # indicates that only the gradient corresponding to index 0 is used, and the
 # gradient values at indices 1-4 are ignored (and hence can be None). The
 # backprop algorithm can then leverage this by not constructing zeros to
 # pass for those indices.
 _grad_fn_accepts_none_for_indices = {
    "SoftmaxCrossEntropyWithLogits": [1],
    "FusedBatchNorm": [1, 2, 3, 4]
 }
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@ -675,7 +675,7 @@ class _EagerTensorBase(Tensor):
    if not context.in_graph_mode():
      self_device = self.device
      def grad_fun(dresult):
-        return dresult._copy(device_name=self_device)
+        return [dresult._copy(device_name=self_device)]
      tape.record_operation("_copy", [new_tensor], [self], [], grad_fun)
    return new_tensor
    # pylint: enable=protected-access