Clean up unused all reduce code
PiperOrigin-RevId: 329068527
Change-Id: I4aa13d38e03cf02c6c65ddab3d83b10f523053b4
commit d98960155a
parent a61f55115c
@@ -13,6 +13,7 @@ exports_files(["LICENSE"])
 py_library(
     name = "distribute_test_lib_pip",
     deps = [
+        ":all_reduce",
         ":combinations",
         ":multi_worker_test_base",
         ":single_loss_example",
@@ -89,7 +90,6 @@ py_library(
     srcs = ["cross_device_utils.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":all_reduce",
         ":values",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:collective_ops",
@@ -141,6 +141,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":all_reduce",
        ":cross_device_ops",
        ":distribute_lib",
        ":mirrored_strategy",
@@ -18,16 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-import collections as pycoll
 import copy
 import threading

-from tensorflow.python.distribute import all_reduce
 from tensorflow.python.distribute import values as value_lib
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.framework import device as pydev
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import collective_ops
@@ -171,65 +168,6 @@ def aggregate_single_gradient_using_copy(grad_and_vars, use_mean,
   return (grad, v), None


-def group_device_names(devices, group_size):
-  """Group device names into groups of group_size.
-
-  Args:
-    devices: a list of canonical device strings.
-    group_size: integer which is equal to or greater than 1.
-
-  Returns:
-    list of lists of devices, where each inner list is group_size long,
-      and each device appears at least once in an inner list. If
-      len(devices) % group_size == 0 then each device will appear exactly once.
-
-  Raises:
-    ValueError: if group_size > len(devices)
-  """
-  num_devices = len(devices)
-  if group_size > num_devices:
-    raise ValueError(
-        'only %d devices, but group_size=%d' % (num_devices, group_size))
-  num_groups = (
-      num_devices // group_size + (1 if (num_devices % group_size != 0) else 0))
-  groups = [[] for i in range(num_groups)]
-  for i in range(num_groups * group_size):
-    groups[i % num_groups].append(devices[i % num_devices])
-  return groups
-
-
-def split_grads_by_size(threshold_size, device_grads):
-  """Break gradients into two sets according to tensor size.
-
-  Args:
-    threshold_size: int size cutoff for small vs large tensor.
-    device_grads: List of lists of (gradient, variable) tuples. The outer
-      list is over devices. The inner list is over individual gradients.
-
-  Returns:
-    small_grads: Subset of device_grads where shape is <= threshold_size
-      elements.
-    large_grads: Subset of device_grads where shape is > threshold_size
-      elements.
-  """
-  small_grads = []
-  large_grads = []
-  for dl in device_grads:
-    small_dl = []
-    large_dl = []
-    for (g, v) in dl:
-      tensor_size = g.get_shape().num_elements()
-      if tensor_size <= threshold_size:
-        small_dl.append([g, v])
-      else:
-        large_dl.append([g, v])
-    if small_dl:
-      small_grads.append(small_dl)
-    if large_dl:
-      large_grads.append(large_dl)
-  return small_grads, large_grads
-
-
 # TODO(yuefengz): use random key starts to avoid reusing keys?
 class CollectiveKeys(object):
   """Class that manages collective keys.
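
For reference, the round-robin behaviour of the deleted group_device_names helper is easy to check in isolation. Below is a minimal, self-contained sketch: the device strings are made up purely for illustration, and the function body simply mirrors the code removed above.

def group_device_names(devices, group_size):
  # Round-robin the devices into ceil(len(devices) / group_size) groups of
  # exactly group_size entries each, wrapping around so that every device
  # appears at least once.
  num_devices = len(devices)
  if group_size > num_devices:
    raise ValueError(
        'only %d devices, but group_size=%d' % (num_devices, group_size))
  num_groups = num_devices // group_size + (1 if num_devices % group_size else 0)
  groups = [[] for _ in range(num_groups)]
  for i in range(num_groups * group_size):
    groups[i % num_groups].append(devices[i % num_devices])
  return groups

devices = ['/gpu:0', '/gpu:1', '/gpu:2', '/gpu:3']
print(group_device_names(devices, 2))
# [['/gpu:0', '/gpu:2'], ['/gpu:1', '/gpu:3']]
print(group_device_names(devices, 3))
# Groups pad by wrapping around the device list:
# [['/gpu:0', '/gpu:2', '/gpu:0'], ['/gpu:1', '/gpu:3', '/gpu:1']]
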
@@ -580,272 +518,6 @@ def build_collective_gather_indexed_slices(input_slices_list,
   return out_slices_list


-def sum_grad_and_var_all_reduce(grad_and_vars,
-                                num_workers,
-                                alg,
-                                gpu_indices,
-                                aux_devices=None,
-                                num_shards=1):
-  """Apply all-reduce algorithm over specified gradient tensors."""
-  with ops.name_scope('allreduce'):
-    # Note that each grad_and_vars looks like the following:
-    #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
-    scaled_grads = [g for g, _ in grad_and_vars]
-    if alg == 'nccl':
-      summed_grads = nccl_ops.all_sum(scaled_grads)
-    elif alg == 'xring':
-      summed_grads = all_reduce.build_ring_all_reduce(
-          scaled_grads, num_workers, num_shards, gpu_indices, math_ops.add)
-    elif alg == 'nccl/xring':
-      summed_grads = all_reduce.build_nccl_then_ring(scaled_grads, num_shards,
-                                                     math_ops.add)
-    elif alg == 'nccl/rechd':
-      summed_grads = all_reduce.build_nccl_then_recursive_hd(
-          scaled_grads, math_ops.add)
-    elif alg == 'nccl/pscpu':
-      summed_grads = all_reduce.build_nccl_then_shuffle(
-          scaled_grads, aux_devices, math_ops.add, math_ops.add_n)
-    elif alg == 'pscpu/pscpu':
-      second_gather_devices = aux_devices[:num_shards]
-      summed_grads = all_reduce.build_shuffle_then_shuffle(
-          scaled_grads, aux_devices, second_gather_devices, math_ops.add_n)
-    elif alg in ['pscpu', 'psgpu']:
-      summed_grads = all_reduce.build_shuffle_all_reduce(
-          scaled_grads, aux_devices, math_ops.add_n)
-    else:
-      raise ValueError('unsupported all_reduce alg: ', alg)
-
-  result = []
-  for (_, v), g in zip(grad_and_vars, summed_grads):
-    result.append([g, v])
-  return result
-
-
-def sum_gradients_all_reduce(dev_prefixes, replica_grads, num_workers, alg,
-                             num_shards, gpu_indices):
-  """Apply all-reduce algorithm over specified gradient tensors.
-
-  Args:
-    dev_prefixes: list of prefix strings to use to generate PS device names.
-    replica_grads: the gradients to reduce.
-    num_workers: number of worker processes across entire job.
-    alg: the all-reduce algorithm to apply.
-    num_shards: alg-specific sharding factor.
-    gpu_indices: indices of local GPUs in order usable for ring-reduce.
-
-  Returns:
-    list of reduced tensors
-  """
-  alg_contains_shuffle = any(n in alg for n in ['pscpu', 'psgpu'])
-  is_hierarchical = '/' in alg
-  if 'pscpu' in alg:
-    aux_devices = [prefix + '/cpu:0' for prefix in dev_prefixes]
-  elif 'psgpu' in alg:
-    aux_devices = [
-        prefix + '/gpu:%d' % i
-        for i in range(len(gpu_indices))
-        for prefix in dev_prefixes
-    ]
-  else:
-    aux_devices = ['/job:localhost/cpu:0']
-  # Auxiliary devices for hierarchical all-reduces.
-  aux_device_groups = group_device_names(
-      aux_devices, num_shards if alg_contains_shuffle else 1)
-  group_index = 0
-  reduced_gv_list = []
-  for grad_and_vars in zip(*replica_grads):
-    reduced_gv_list.append(
-        sum_grad_and_var_all_reduce(
-            grad_and_vars, num_workers, alg, gpu_indices, aux_devices
-            if is_hierarchical else aux_device_groups[group_index], num_shards))
-    group_index = (group_index + 1) % len(aux_device_groups)
-  new_replica_grads = [list(x) for x in zip(*reduced_gv_list)]
-  return new_replica_grads
-
-
-def extract_ranges(index_list, range_size_limit=32):
-  """Extract consecutive ranges and singles from index_list.
-
-  Args:
-    index_list: List of monotone increasing non-negative integers.
-    range_size_limit: Largest size range to return. If a larger
-      consecutive range exists, it will be returned as multiple
-      ranges.
-
-  Returns:
-    (ranges, singles) where ranges is a list of [first, last] pairs of
-      consecutive elements in index_list, and singles is all of the
-      other elements, in original order.
-  """
-  if not index_list:
-    return [], []
-  first = index_list[0]
-  last = first
-  ranges = []
-  singles = []
-  for i in index_list[1:]:
-    if i == last + 1 and (last - first) <= range_size_limit:
-      last = i
-    else:
-      if last > first:
-        ranges.append([first, last])
-      else:
-        singles.append(first)
-      first = i
-      last = i
-  if last > first:
-    ranges.append([first, last])
-  else:
-    singles.append(first)
-  return ranges, singles
-
-
-GradPackTuple = pycoll.namedtuple('GradPackTuple', 'indices vars shapes')
-
-
-def pack_range(key, packing, grad_vars, rng):
-  """Form the concatenation of a specified range of gradient tensors.
-
-  Args:
-    key: Value under which to store meta-data in packing that will be used
-      later to restore the grad_var list structure.
-    packing: Dict holding data describing packed ranges of small tensors.
-    grad_vars: List of (grad, var) pairs for one replica.
-    rng: A pair of integers giving the first, last indices of a consecutive
-      range of tensors to be packed.
-
-  Returns:
-    A tensor that is the concatenation of all the specified small tensors.
-  """
-  to_pack = grad_vars[rng[0]:rng[1] + 1]
-  members = []
-  variables = []
-  restore_shapes = []
-  with ops.name_scope('pack'):
-    for g, v in to_pack:
-      variables.append(v)
-      restore_shapes.append(g.shape)
-      with ops.device(g.device):
-        members.append(array_ops.reshape(g, [-1]))
-    packing[key] = GradPackTuple(
-        indices=range(rng[0], rng[1] + 1),
-        vars=variables,
-        shapes=restore_shapes)
-    with ops.device(members[0].device):
-      return array_ops.concat(members, 0)
-
-
-def unpack_grad_tuple(gv, gpt):
-  """Unpack a previously packed collection of gradient tensors.
-
-  Args:
-    gv: A (grad, var) pair to be unpacked.
-    gpt: A GradPackTuple describing the packing operation that produced gv.
-
-  Returns:
-    A list of (grad, var) pairs corresponding to the values that were
-    originally packed into gv, maybe following subsequent operations like
-    reduction.
-  """
-  elt_widths = [x.num_elements() for x in gpt.shapes]
-  with ops.device(gv[0].device):
-    with ops.name_scope('unpack'):
-      splits = array_ops.split(gv[0], elt_widths)
-      unpacked_gv = []
-      for idx, s in enumerate(splits):
-        unpacked_gv.append((array_ops.reshape(s, gpt.shapes[idx]),
-                            gpt.vars[idx]))
-  return unpacked_gv
-
-
-def pack_small_tensors(replica_grads, max_bytes=0, max_group=0):
-  """Concatenate small gradient tensors together for reduction.
-
-  Args:
-    replica_grads: List of lists of (gradient, variable) tuples.
-    max_bytes: Int giving max number of bytes in a tensor that
-      may be considered small.
-    max_group: Int giving max number of small tensors that may be
-      concatenated into one new tensor.
-
-  Returns:
-    new_replica_grads, packing where new_replica_grads is identical to
-      replica_grads except that all feasible small_tensors have been removed
-      from their places and concatenated into larger tensors that are
-      now in the front of the list for each replica, and packing contains
-      the data necessary to restore the replica_grads structure.
-
-  Look through the first replica for gradients of the same type (float),
-  and small size, that are all sequential. For each such group,
-  replace by a new tensor that is a flattened concatenation. Note
-  that the corresponding variable will be absent, which doesn't matter
-  because it isn't used during all-reduce.
-
-  Requires:
-    Every gv_list in replicas must have isomorphic structure including identical
-    tensor sizes and types.
-  """
-  small_indices = []
-  large_indices = []
-  for idx, (g, _) in enumerate(replica_grads[0]):
-    if g.dtype == dtypes.float32 and (4 * g.shape.num_elements()) <= max_bytes:
-      small_indices.append(idx)
-    else:
-      large_indices.append(idx)
-  small_ranges, small_singles = extract_ranges(
-      small_indices, range_size_limit=max_group)
-  large_indices = sorted(large_indices + small_singles)
-  num_gv = len(replica_grads[0])
-  packing = {}
-  if small_ranges:
-    new_replica_grads = []
-    for dev_idx, gv_list in enumerate(replica_grads):
-      assert len(gv_list) == num_gv
-      new_gv_list = []
-      for r in small_ranges:
-        key = '%d:%d' % (dev_idx, len(new_gv_list))
-        new_gv_list.append((pack_range(key, packing, gv_list, r),
-                            'packing_var_placeholder'))
-      for i in large_indices:
-        new_gv_list.append(gv_list[i])
-      new_replica_grads.append(new_gv_list)
-    return new_replica_grads, packing
-  else:
-    return replica_grads, None
-
-
-def unpack_small_tensors(replica_grads, packing):
-  """Undo the structure alterations to replica_grads done by pack_small_tensors.
-
-  Args:
-    replica_grads: List of List of (grad, var) tuples.
-    packing: A dict generated by pack_small_tensors describing the changes
-      it made to replica_grads.
-
-  Returns:
-    new_replica_grads: identical to replica_grads except that concatenations
-      of small tensors have been split apart and returned to their original
-      positions, paired with their original variables.
-  """
-  if not packing:
-    return replica_grads
-  new_replica_grads = []
-  num_devices = len(replica_grads)
-  num_packed = len(packing.keys()) // num_devices
-  for dev_idx, gv_list in enumerate(replica_grads):
-    gv_list = list(gv_list)
-    new_gv_list = gv_list[num_packed:]
-    for i in range(num_packed):
-      k = '%d:%d' % (dev_idx, i)
-      gpt = packing[k]
-      gv = unpack_grad_tuple(gv_list[i], gpt)
-      for gi, idx in enumerate(gpt.indices):
-        assert idx == gpt.indices[gi]
-        new_gv_list.insert(idx, gv[gi])
-    new_replica_grads.append(new_gv_list)
-  return new_replica_grads
-
-
 def aggregate_tensors_or_indexed_slices(values, accumulation_fn=math_ops.add_n):
   """Aggregate tensors using `accumulation_fn` and IndexedSlices via concat."""
   if any(isinstance(v, ops.IndexedSlices) for v in values):
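
The packing helpers deleted above revolve around extract_ranges, which buckets the indices of "small" gradients into consecutive runs (to be concatenated) and leftovers (to stay as-is). A minimal, self-contained sketch of that algorithm, mirroring the removed code with an illustrative input:

def extract_ranges(index_list, range_size_limit=32):
  # Single pass over monotonically increasing indices: emit [first, last]
  # pairs for consecutive runs and collect isolated indices in `singles`.
  if not index_list:
    return [], []
  first = index_list[0]
  last = first
  ranges = []
  singles = []
  for i in index_list[1:]:
    if i == last + 1 and (last - first) <= range_size_limit:
      last = i
    else:
      if last > first:
        ranges.append([first, last])
      else:
        singles.append(first)
      first = i
      last = i
  if last > first:
    ranges.append([first, last])
  else:
    singles.append(first)
  return ranges, singles

print(extract_ranges([0, 1, 2, 5, 7, 8, 11]))
# ([[0, 2], [7, 8]], [5, 11])

Per the removed docstrings, pack_small_tensors flattened and concatenated each [first, last] run into one tensor before all-reduce, and unpack_small_tensors used the recorded GradPackTuple metadata to split and reshape the reduced result back into its original positions.
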
@@ -875,18 +547,6 @@ def copy_tensor_or_indexed_slices_to_device(value, device):
   return result


-def contains_indexed_slices(value):
-  """Check whether the value is `IndexedSlices` or contains `IndexedSlices`."""
-  if isinstance(value, ops.IndexedSlices):
-    return True
-  elif isinstance(value, (list, tuple)) and value:
-    return any(contains_indexed_slices(v) for v in value)
-  elif isinstance(value, value_lib.DistributedValues):
-    return contains_indexed_slices(value.values)
-  else:
-    return False
-
-
 def is_indexed_slices(value):
   if isinstance(value, ops.IndexedSlices):
     return True
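
For context on the hunk above: the removed contains_indexed_slices walked arbitrarily nested lists, tuples and DistributedValues looking for an IndexedSlices, while the surviving is_indexed_slices (only partially visible here) checks a single value directly. A TensorFlow-free sketch of the removed recursive walk, using stand-in classes purely for illustration:

class IndexedSlices(object):
  """Stand-in for ops.IndexedSlices, for illustration only."""

class DistributedValues(object):
  """Stand-in for value_lib.DistributedValues, for illustration only."""

  def __init__(self, values):
    self.values = values

def contains_indexed_slices(value):
  # Recursive membership test, mirroring the helper removed in this commit.
  if isinstance(value, IndexedSlices):
    return True
  elif isinstance(value, (list, tuple)) and value:
    return any(contains_indexed_slices(v) for v in value)
  elif isinstance(value, DistributedValues):
    return contains_indexed_slices(value.values)
  else:
    return False

print(contains_indexed_slices([1.0, (2.0, IndexedSlices())]))   # True
print(contains_indexed_slices(DistributedValues((1.0, 2.0))))   # False
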
@@ -81,32 +81,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase):
   def testIsIndexedSlices(self):
     t = math_ops._as_indexed_slices(
         constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
-    self.assertTrue(cross_device_utils.contains_indexed_slices(t))
+    self.assertTrue(cross_device_utils.is_indexed_slices(t))

-  @test_util.run_in_graph_and_eager_modes
-  def testContainsIndexedSlices_List(self):
-    t0 = math_ops._as_indexed_slices(
-        constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
-    t1 = math_ops._as_indexed_slices(
-        constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
-    self.assertTrue(cross_device_utils.contains_indexed_slices([t0, t1]))
-
-  @test_util.run_in_graph_and_eager_modes
-  def testContainsIndexedSlices_Tuple(self):
-    t0 = math_ops._as_indexed_slices(
-        constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
-    t1 = math_ops._as_indexed_slices(
-        constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
-    self.assertTrue(cross_device_utils.contains_indexed_slices((t0, t1)))
-
-  @test_util.run_in_graph_and_eager_modes
-  def testContainsIndexedSlices_PerReplica(self):
-    t0 = math_ops._as_indexed_slices(
-        constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
-    t1 = math_ops._as_indexed_slices(
-        constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
-    per_replica = value_lib.PerReplica((t0, t1))
-    self.assertTrue(cross_device_utils.contains_indexed_slices(per_replica))
-
   @combinations.generate(combinations.combine(
       mode=["graph", "eager"],