Add number_of_partitions to the InfeedQueue API so that infeed can be processed with pure data parallelism while the partitioning/resharding happens inside the model function.
PiperOrigin-RevId: 324063661
Change-Id: Ieba2ca2b0a5092cceea4b51e34fb1b30d539d579
Commit c1336e9a40 (parent f999bb1785)
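For orientation, here is a minimal usage sketch of the new argument. The dtypes, shapes, and shard/partition counts below are illustrative assumptions, the enqueue ops and the tpu.replicate() wiring that a real input pipeline needs are omitted, and InfeedQueue is assumed to live in tensorflow.python.tpu.tpu_feed:

# Hypothetical configuration: 8 TPU cores fed data-parallel, with each
# replica's XLA computation partitioned across 2 of those cores.
import tensorflow as tf
from tensorflow.python.tpu import tpu_feed

infeed = tpu_feed.InfeedQueue(
    tuple_types=[tf.float32, tf.int32],
    tuple_shapes=[[64, 128], [64]],   # full (unsharded) shapes of the tuple
    number_of_partitions=2)           # XLA computation is 2-way partitioned
infeed.set_number_of_shards(8)        # infeed stays data parallel over 8 cores

With this configuration, the dequeue op generated inside the replicated computation reports the per-replica, unpartitioned shape and carries the XLA sharding annotation introduced by this change.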
@@ -135,6 +135,7 @@ class InfeedQueue(object):
                tuple_types=None,
                tuple_shapes=None,
                shard_dimensions=None,
+               number_of_partitions=None,
                name=None):
     """Creates a new InfeedQueue with the given configuration.

@@ -150,6 +151,13 @@ class InfeedQueue(object):
       shard_dimensions: if not None, a list of dimensions on which the
         elements of the queue should be sharded during automatic
         parallelization.
+      number_of_partitions: if > 1, the infeed dequeue shape will contain
+        the full shape that includes all partitions, and the corresponding XLA
+        annotation is added to the infeed dequeue op. In this case the infeed
+        is still data parallel, feeding the per-core batch size to each core,
+        while the XLA computation may be partitioned. Since XLA requires the
+        dequeue shape to be the per-replica shape, number_of_partitions is
+        needed here to compute the per-replica unpartitioned shape.
       name: the name of the queue.

     Raises:
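To make the docstring's shape rule concrete, a small worked calculation follows; the batch sizes are arbitrary assumptions and the variable names are not part of the API:

per_core_batch = 8    # batch size the host feeds to each core (data parallel)
num_partitions = 2    # cores over which one replica's computation is partitioned

# XLA wants the dequeue shape expressed per replica, so the sharded (batch)
# dimension is scaled back up by the number of partitions:
per_replica_batch = per_core_batch * num_partitions
assert per_replica_batch == 16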
@@ -166,6 +174,10 @@ class InfeedQueue(object):
     self._generated_enqueue_ops = False
     self._generated_dequeue_op = False
     self._name = "InfeedQueue" if name is None else name
+    if number_of_partitions is None:
+      self._number_of_partitions = 1
+    else:
+      self._number_of_partitions = number_of_partitions
     if number_of_tuple_elements is None:
       if tuple_types is not None:
         number_of_tuple_elements = len(tuple_types)
@@ -359,6 +371,7 @@ class InfeedQueue(object):
     """
     for policy in self._sharding_policies:
       policy.set_number_of_shards(number_of_shards)
+      policy.set_number_of_partitions(self._number_of_partitions)
     self._validate()

   def set_configuration_from_input_tensors(self, input_tensors):
@@ -485,16 +498,23 @@ class InfeedQueue(object):
     self._generated_dequeue_op = True
     full_name = "%s/dequeue" % self._name
     sharded_shapes = [
-        policy.get_sharded_shape(shape)
+        policy.get_unpartitioned_shape(policy.get_sharded_shape(shape))
         for (shape, policy) in zip(self._tuple_shapes, self._sharding_policies)
     ]
     if tpu_device is not None:
       with ops.device(tpu.core(tpu_device)):
-        return tpu_ops.infeed_dequeue_tuple(
+        dequeue_op = tpu_ops.infeed_dequeue_tuple(
             dtypes=self._tuple_types, shapes=sharded_shapes, name=full_name)
     else:
-      return tpu_ops.infeed_dequeue_tuple(
+      dequeue_op = tpu_ops.infeed_dequeue_tuple(
          dtypes=self._tuple_types, shapes=sharded_shapes, name=full_name)
+    if self._number_of_partitions <= 1:
+      return dequeue_op
+    partitions = [
+        policy.get_unpartitioned_shape([1] * shape.ndims).as_list()
+        for (shape, policy) in zip(self._tuple_shapes, self._sharding_policies)
+    ]
+    return tag_sharding_attribute_for_dequeued_tensors(dequeue_op, partitions)

   def _generate_enqueue_op(self,
                            inputs,
@@ -34,6 +34,7 @@ class ShardingPolicy(object):

   def __init__(self):
     self._number_of_shards = None
+    self._number_of_partitions = 1
     self._shard_dimension = None
     self._frozen = False

@@ -92,6 +93,32 @@ class ShardingPolicy(object):
           "Can't set sharding policy to use %s shards; value must be >0" %
           str(number_of_shards))

+  @property
+  def number_of_partitions(self):
+    """Returns the number of partitions of the policy, or 1 if unspecified."""
+    return self._number_of_partitions
+
+  def set_number_of_partitions(self, number_of_partitions):
+    """Sets the number of partitions for the current policy.
+
+    If the policy has been frozen then number_of_partitions must match the
+    existing setting.
+
+    Args:
+      number_of_partitions: The number of partitions to use in the policy.
+
+    Raises:
+      ValueError: If the policy has been frozen and number_of_partitions
+        differs from the frozen value.
+    """
+    if self._frozen:
+      if self._number_of_partitions != number_of_partitions:
+        raise ValueError(
+            "Can't set number_of_partitions to %d since it has been frozen to "
+            "use %d." % (number_of_partitions, self._number_of_partitions))
+    else:
+      self._number_of_partitions = number_of_partitions
+
   @property
   def shard_dimension(self):
     """Returns the shard dimension of the policy or None if unspecified."""
@@ -134,6 +161,34 @@ class ShardingPolicy(object):
     if other.shard_dimension is not None:
       self.set_shard_dimension(other.shard_dimension)

+  def get_unpartitioned_shape(self, shape):
+    """Returns the shape of an unpartitioned Tensor.
+
+    When given the shape of a 'sharded-size' Tensor, returns the full
+    shape of its unpartitioned Tensor.
+
+    Args:
+      shape: The shape of the sharded Tensor.
+
+    Returns:
+      The shape of the unpartitioned version of the Tensor.
+
+    Raises:
+      ValueError: if shape has an unknown sharded dimension.
+    """
+    shape = tensor_shape.as_shape(shape)
+    dims = shape.as_list()
+    if (self._shard_dimension is None or self._number_of_partitions is None or
+        not dims):
+      return None
+    if dims[self._shard_dimension] is None:
+      raise ValueError("shape %s must have a fixed size for dimension %d "
+                       "that is known at graph construction time." %
+                       (shape.as_list(), self._shard_dimension))
+    if self._number_of_partitions > 1:
+      dims[self._shard_dimension] *= self._number_of_partitions
+    return tensor_shape.as_shape(dims)
+
   def get_sharded_shape(self, shape, shard_index=None):
     """Returns the shape of a shard of a full Tensor.

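As a runnable illustration of how the new method composes with get_sharded_shape on the dequeue path above, here is a small sketch; the shard/partition counts and shapes are assumptions chosen so the batch dimension divides evenly, and ShardingPolicy is assumed to live in tensorflow.python.tpu.tpu_sharding:

from tensorflow.python.tpu import tpu_sharding

policy = tpu_sharding.ShardingPolicy()
policy.set_number_of_shards(8)        # 8 cores are fed data-parallel
policy.set_shard_dimension(0)         # shard along the batch dimension
policy.set_number_of_partitions(2)    # each replica spans 2 partitions

full_shape = [64, 128]                                  # unsharded tuple element
per_core = policy.get_sharded_shape(full_shape)         # -> [8, 128] per core
per_replica = policy.get_unpartitioned_shape(per_core)  # -> [16, 128] per replica
print(per_core, per_replica)

The test added below exercises the same method with different numbers.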
@@ -107,6 +107,17 @@ class ShardingTest(test.TestCase):
     with self.assertRaises(ValueError):
       _ = p.get_sharded_shape([4, 10], shard_index=-1)

+  def testGetUnpartitionedShape(self):
+    """Tests getting an unpartitioned shape."""
+    p = tpu_sharding.ShardingPolicy()
+    p.set_number_of_shards(3)
+    p.set_shard_dimension(1)
+    p.set_number_of_partitions(4)
+    self.assertEqual(p.get_unpartitioned_shape([3, 5]), [3, 20])
+    p.freeze()
+    with self.assertRaises(ValueError):
+      _ = p.get_unpartitioned_shape([3, None])
+
   def testGetUnshardedShape(self):
     """Tests getting an unsharded shape."""
     p = tpu_sharding.ShardingPolicy()