Clean up unused all reduce code

PiperOrigin-RevId: 329068527
Change-Id: I4aa13d38e03cf02c6c65ddab3d83b10f523053b4
Ran Chen 2020-08-28 23:53:33 -07:00 committed by TensorFlower Gardener
parent a61f55115c
commit d98960155a
3 changed files with 3 additions and 367 deletions

View File

@@ -13,6 +13,7 @@ exports_files(["LICENSE"])
py_library(
name = "distribute_test_lib_pip",
deps = [
":all_reduce",
":combinations",
":multi_worker_test_base",
":single_loss_example",
@@ -89,7 +90,6 @@ py_library(
srcs = ["cross_device_utils.py"],
srcs_version = "PY2AND3",
deps = [
":all_reduce",
":values",
"//tensorflow/python:array_ops",
"//tensorflow/python:collective_ops",
@@ -141,6 +141,7 @@ py_library(
],
srcs_version = "PY2AND3",
deps = [
":all_reduce",
":cross_device_ops",
":distribute_lib",
":mirrored_strategy",

View File

@@ -18,16 +18,13 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections as pycoll
import copy
import threading
from tensorflow.python.distribute import all_reduce
from tensorflow.python.distribute import values as value_lib
from tensorflow.python.eager import backprop
from tensorflow.python.eager import context
from tensorflow.python.framework import device as pydev
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import collective_ops
@@ -171,65 +168,6 @@ def aggregate_single_gradient_using_copy(grad_and_vars, use_mean,
return (grad, v), None
def group_device_names(devices, group_size):
"""Group device names into groups of group_size.
Args:
devices: a list of canonical device strings.
group_size: integer which is equal to or greater than 1.
Returns:
list of lists of devices, where each inner list is group_size long,
and each device appears at least once in an inner list. If
len(devices) % group_size == 0 then each device will appear exactly once.
Raises:
ValueError: if group_size > len(devices)
"""
num_devices = len(devices)
if group_size > num_devices:
raise ValueError(
'only %d devices, but group_size=%d' % (num_devices, group_size))
num_groups = (
num_devices // group_size + (1 if (num_devices % group_size != 0) else 0))
groups = [[] for i in range(num_groups)]
for i in range(num_groups * group_size):
groups[i % num_groups].append(devices[i % num_devices])
return groups
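Before this commit, the removed helper could be used as in the following sketch; the device strings are illustrative:

from tensorflow.python.distribute import cross_device_utils  # pre-removal import

devices = ['/gpu:0', '/gpu:1', '/gpu:2']  # illustrative canonical device names
groups = cross_device_utils.group_device_names(devices, 2)
# Three devices do not divide evenly into groups of two, so one device repeats:
# groups == [['/gpu:0', '/gpu:2'], ['/gpu:1', '/gpu:0']]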
def split_grads_by_size(threshold_size, device_grads):
"""Break gradients into two sets according to tensor size.
Args:
threshold_size: int size cutoff for small vs large tensor.
device_grads: List of lists of (gradient, variable) tuples. The outer
list is over devices. The inner list is over individual gradients.
Returns:
small_grads: Subset of device_grads where shape is <= threshold_size
elements.
large_grads: Subset of device_grads where shape is > threshold_size
elements.
"""
small_grads = []
large_grads = []
for dl in device_grads:
small_dl = []
large_dl = []
for (g, v) in dl:
tensor_size = g.get_shape().num_elements()
if tensor_size <= threshold_size:
small_dl.append([g, v])
else:
large_dl.append([g, v])
if small_dl:
small_grads.append(small_dl)
if large_dl:
large_grads.append(large_dl)
return small_grads, large_grads
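A minimal sketch of how the removed split behaved before this commit; the shapes and the 1024-element threshold are arbitrary:

import tensorflow as tf
from tensorflow.python.distribute import cross_device_utils  # pre-removal import

v_small = tf.Variable(tf.zeros([8]))       # 8 elements
v_large = tf.Variable(tf.zeros([64, 64]))  # 4096 elements
device_grads = [[(tf.ones([8]), v_small), (tf.ones([64, 64]), v_large)]]  # one device
small, large = cross_device_utils.split_grads_by_size(1024, device_grads)
# small == [[[<8-element grad>, v_small]]]    (at or below the threshold)
# large == [[[<4096-element grad>, v_large]]] (above the threshold)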
# TODO(yuefengz): use random key starts to avoid reusing keys?
class CollectiveKeys(object):
"""Class that manages collective keys.
@@ -580,272 +518,6 @@ def build_collective_gather_indexed_slices(input_slices_list,
return out_slices_list
def sum_grad_and_var_all_reduce(grad_and_vars,
num_workers,
alg,
gpu_indices,
aux_devices=None,
num_shards=1):
"""Apply all-reduce algorithm over specified gradient tensors."""
with ops.name_scope('allreduce'):
# Note that each grad_and_vars looks like the following:
# ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
scaled_grads = [g for g, _ in grad_and_vars]
if alg == 'nccl':
summed_grads = nccl_ops.all_sum(scaled_grads)
elif alg == 'xring':
summed_grads = all_reduce.build_ring_all_reduce(
scaled_grads, num_workers, num_shards, gpu_indices, math_ops.add)
elif alg == 'nccl/xring':
summed_grads = all_reduce.build_nccl_then_ring(scaled_grads, num_shards,
math_ops.add)
elif alg == 'nccl/rechd':
summed_grads = all_reduce.build_nccl_then_recursive_hd(
scaled_grads, math_ops.add)
elif alg == 'nccl/pscpu':
summed_grads = all_reduce.build_nccl_then_shuffle(
scaled_grads, aux_devices, math_ops.add, math_ops.add_n)
elif alg == 'pscpu/pscpu':
second_gather_devices = aux_devices[:num_shards]
summed_grads = all_reduce.build_shuffle_then_shuffle(
scaled_grads, aux_devices, second_gather_devices, math_ops.add_n)
elif alg in ['pscpu', 'psgpu']:
summed_grads = all_reduce.build_shuffle_all_reduce(
scaled_grads, aux_devices, math_ops.add_n)
else:
raise ValueError('unsupported all_reduce alg: ', alg)
result = []
for (_, v), g in zip(grad_and_vars, summed_grads):
result.append([g, v])
return result
def sum_gradients_all_reduce(dev_prefixes, replica_grads, num_workers, alg,
num_shards, gpu_indices):
"""Apply all-reduce algorithm over specified gradient tensors.
Args:
dev_prefixes: list of prefix strings to use to generate PS device names.
replica_grads: the gradients to reduce.
num_workers: number of worker processes across entire job.
alg: the all-reduce algorithm to apply.
num_shards: alg-specific sharding factor.
gpu_indices: indices of local GPUs in order usable for ring-reduce.
Returns:
list of reduced tensors
"""
alg_contains_shuffle = any(n in alg for n in ['pscpu', 'psgpu'])
is_hierarchical = '/' in alg
if 'pscpu' in alg:
aux_devices = [prefix + '/cpu:0' for prefix in dev_prefixes]
elif 'psgpu' in alg:
aux_devices = [
prefix + '/gpu:%d' % i
for i in range(len(gpu_indices))
for prefix in dev_prefixes
]
else:
aux_devices = ['/job:localhost/cpu:0']
# Auxiliary devices for hierarchical all-reduces.
aux_device_groups = group_device_names(
aux_devices, num_shards if alg_contains_shuffle else 1)
group_index = 0
reduced_gv_list = []
for grad_and_vars in zip(*replica_grads):
reduced_gv_list.append(
sum_grad_and_var_all_reduce(
grad_and_vars, num_workers, alg, gpu_indices, aux_devices
if is_hierarchical else aux_device_groups[group_index], num_shards))
group_index = (group_index + 1) % len(aux_device_groups)
new_replica_grads = [list(x) for x in zip(*reduced_gv_list)]
return new_replica_grads
def extract_ranges(index_list, range_size_limit=32):
"""Extract consecutive ranges and singles from index_list.
Args:
index_list: List of monotone increasing non-negative integers.
range_size_limit: Largest size range to return. If a larger
consecutive range exists, it will be returned as multiple
ranges.
Returns:
(ranges, singles) where ranges is a list of [first, last] pairs of
consecutive elements in index_list, and singles is all of the
other elements, in original order.
"""
if not index_list:
return [], []
first = index_list[0]
last = first
ranges = []
singles = []
for i in index_list[1:]:
if i == last + 1 and (last - first) <= range_size_limit:
last = i
else:
if last > first:
ranges.append([first, last])
else:
singles.append(first)
first = i
last = i
if last > first:
ranges.append([first, last])
else:
singles.append(first)
return ranges, singles
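A minimal, pure-Python sketch of the removed helper's behaviour (the index list is arbitrary):

from tensorflow.python.distribute import cross_device_utils  # pre-removal import

ranges, singles = cross_device_utils.extract_ranges([0, 1, 2, 5, 7, 8])
# ranges  == [[0, 2], [7, 8]]   consecutive runs collapsed to [first, last] pairs
# singles == [5]                indices with no consecutive neighbour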
GradPackTuple = pycoll.namedtuple('GradPackTuple', 'indices vars shapes')
def pack_range(key, packing, grad_vars, rng):
"""Form the concatenation of a specified range of gradient tensors.
Args:
key: Value under which to store meta-data in packing that will be used
later to restore the grad_var list structure.
packing: Dict holding data describing packed ranges of small tensors.
grad_vars: List of (grad, var) pairs for one replica.
rng: A pair of integers giving the first, last indices of a consecutive
range of tensors to be packed.
Returns:
A tensor that is the concatenation of all the specified small tensors.
"""
to_pack = grad_vars[rng[0]:rng[1] + 1]
members = []
variables = []
restore_shapes = []
with ops.name_scope('pack'):
for g, v in to_pack:
variables.append(v)
restore_shapes.append(g.shape)
with ops.device(g.device):
members.append(array_ops.reshape(g, [-1]))
packing[key] = GradPackTuple(
indices=range(rng[0], rng[1] + 1),
vars=variables,
shapes=restore_shapes)
with ops.device(members[0].device):
return array_ops.concat(members, 0)
def unpack_grad_tuple(gv, gpt):
"""Unpack a previously packed collection of gradient tensors.
Args:
gv: A (grad, var) pair to be unpacked.
gpt: A GradPackTuple describing the packing operation that produced gv.
Returns:
A list of (grad, var) pairs corresponding to the values that were
originally packed into gv, maybe following subsequent operations like
reduction.
"""
elt_widths = [x.num_elements() for x in gpt.shapes]
with ops.device(gv[0].device):
with ops.name_scope('unpack'):
splits = array_ops.split(gv[0], elt_widths)
unpacked_gv = []
for idx, s in enumerate(splits):
unpacked_gv.append((array_ops.reshape(s, gpt.shapes[idx]),
gpt.vars[idx]))
return unpacked_gv
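A hedged sketch of unpacking a hand-built concatenation with the removed helper; the GradPackTuple contents and the placeholder variable strings are illustrative stand-ins:

import tensorflow as tf
from tensorflow.python.distribute import cross_device_utils  # pre-removal import

packed = tf.constant([1., 2., 3., 4., 5., 6.])   # two gradients flattened together
gpt = cross_device_utils.GradPackTuple(
    indices=range(2),
    vars=['var0', 'var1'],                       # stand-ins for real variables
    shapes=[tf.TensorShape([2]), tf.TensorShape([2, 2])])
unpacked = cross_device_utils.unpack_grad_tuple(
    (packed, 'packing_var_placeholder'), gpt)
# unpacked pairs a [2]-shaped tensor [1., 2.] with 'var0' and a [2, 2]-shaped
# tensor [[3., 4.], [5., 6.]] with 'var1'.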
def pack_small_tensors(replica_grads, max_bytes=0, max_group=0):
"""Concatenate small gradient tensors together for reduction.
Args:
replica_grads: List of lists of (gradient, variable) tuples.
max_bytes: Int giving max number of bytes in a tensor that
may be considered small.
max_group: Int giving max number of small tensors that may be
concatenated into one new tensor.
Returns:
new_replica_grads, packing where new_replica_grads is identical to
replica_grads except that all feasible small_tensors have been removed
from their places and concatenated into larger tensors that are
now in the front of the list for each replica, and packing contains
the data necessary to restore the replica_grads structure.
Look through the first replica for gradients of the same type (float),
and small size, that are all sequential. For each such group,
replace by a new tensor that is a flattened concatenation. Note
that the corresponding variable will be absent, which doesn't matter
because it isn't used during all-reduce.
Requires:
Every gv_list in replicas must have isomorphic structure including identical
tensor sizes and types.
"""
small_indices = []
large_indices = []
for idx, (g, _) in enumerate(replica_grads[0]):
if g.dtype == dtypes.float32 and (4 * g.shape.num_elements()) <= max_bytes:
small_indices.append(idx)
else:
large_indices.append(idx)
small_ranges, small_singles = extract_ranges(
small_indices, range_size_limit=max_group)
large_indices = sorted(large_indices + small_singles)
num_gv = len(replica_grads[0])
packing = {}
if small_ranges:
new_replica_grads = []
for dev_idx, gv_list in enumerate(replica_grads):
assert len(gv_list) == num_gv
new_gv_list = []
for r in small_ranges:
key = '%d:%d' % (dev_idx, len(new_gv_list))
new_gv_list.append((pack_range(key, packing, gv_list, r),
'packing_var_placeholder'))
for i in large_indices:
new_gv_list.append(gv_list[i])
new_replica_grads.append(new_gv_list)
return new_replica_grads, packing
else:
return replica_grads, None
def unpack_small_tensors(replica_grads, packing):
"""Undo the structure alterations to replica_grads done by pack_small_tensors.
Args:
replica_grads: List of List of (grad, var) tuples.
packing: A dict generated by pack_small_tensors describing the changes
it made to replica_grads.
Returns:
new_replica_grads: identical to replica_grads except that concatenations
of small tensors have been split apart and returned to their original
positions, paired with their original variables.
"""
if not packing:
return replica_grads
new_replica_grads = []
num_devices = len(replica_grads)
num_packed = len(packing.keys()) // num_devices
for dev_idx, gv_list in enumerate(replica_grads):
gv_list = list(gv_list)
new_gv_list = gv_list[num_packed:]
for i in range(num_packed):
k = '%d:%d' % (dev_idx, i)
gpt = packing[k]
gv = unpack_grad_tuple(gv_list[i], gpt)
for gi, idx in enumerate(gpt.indices):
assert idx == gpt.indices[gi]
new_gv_list.insert(idx, gv[gi])
new_replica_grads.append(new_gv_list)
return new_replica_grads
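A round-trip sketch of the two removed packing helpers; the tiny gradients and the string stand-ins for variables are hypothetical, and the byte/group limits are arbitrary:

import tensorflow as tf
from tensorflow.python.distribute import cross_device_utils  # pre-removal import

def one_replica():  # two small float32 gradients per replica
  return [(tf.constant([1., 2.]), 'var0'), (tf.constant([3.]), 'var1')]

replica_grads = [one_replica(), one_replica()]   # two replicas
packed, packing = cross_device_utils.pack_small_tensors(
    replica_grads, max_bytes=256, max_group=16)
# Each replica now starts with one 3-element concatenation paired with the
# 'packing_var_placeholder' marker; `packing` records how to undo this.
restored = cross_device_utils.unpack_small_tensors(packed, packing)
# restored mirrors replica_grads: the concatenation is split and reshaped back
# into ([1., 2.], 'var0') and ([3.], 'var1') for every replica.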
def aggregate_tensors_or_indexed_slices(values, accumulation_fn=math_ops.add_n):
"""Aggregate tensors using `accumulation_fn` and IndexedSlices via concat."""
if any(isinstance(v, ops.IndexedSlices) for v in values):
@@ -875,18 +547,6 @@ def copy_tensor_or_indexed_slices_to_device(value, device):
return result
def contains_indexed_slices(value):
"""Check whether the value is `IndexedSlices` or contains `IndexedSlices`."""
if isinstance(value, ops.IndexedSlices):
return True
elif isinstance(value, (list, tuple)) and value:
return any(contains_indexed_slices(v) for v in value)
elif isinstance(value, value_lib.DistributedValues):
return contains_indexed_slices(value.values)
else:
return False
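Before this commit the predicate could be exercised directly; a minimal sketch (tensor values are arbitrary):

import tensorflow as tf
from tensorflow.python.distribute import cross_device_utils  # pre-removal import

dense = tf.constant([[1., 2.]])
sparse = tf.IndexedSlices(values=tf.constant([[3., 4.]]),
                          indices=tf.constant([0]),
                          dense_shape=tf.constant([2, 2]))
cross_device_utils.contains_indexed_slices(dense)            # False
cross_device_utils.contains_indexed_slices([dense, sparse])  # True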
def is_indexed_slices(value):
if isinstance(value, ops.IndexedSlices):
return True

View File

@@ -81,32 +81,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase):
def testIsIndexedSlices(self):
t = math_ops._as_indexed_slices(
constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
self.assertTrue(cross_device_utils.contains_indexed_slices(t))
@test_util.run_in_graph_and_eager_modes
def testContainsIndexedSlices_List(self):
t0 = math_ops._as_indexed_slices(
constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
t1 = math_ops._as_indexed_slices(
constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
self.assertTrue(cross_device_utils.contains_indexed_slices([t0, t1]))
@test_util.run_in_graph_and_eager_modes
def testContainsIndexedSlices_Tuple(self):
t0 = math_ops._as_indexed_slices(
constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
t1 = math_ops._as_indexed_slices(
constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
self.assertTrue(cross_device_utils.contains_indexed_slices((t0, t1)))
@test_util.run_in_graph_and_eager_modes
def testContainsIndexedSlices_PerReplica(self):
t0 = math_ops._as_indexed_slices(
constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
t1 = math_ops._as_indexed_slices(
constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
per_replica = value_lib.PerReplica((t0, t1))
self.assertTrue(cross_device_utils.contains_indexed_slices(per_replica))
self.assertTrue(cross_device_utils.is_indexed_slices(t))
@combinations.generate(combinations.combine(
mode=["graph", "eager"],