From 471993acd6e05729b0ec19d2200d08f3ccacfd53 Mon Sep 17 00:00:00 2001
From: Priya Gupta
Date: Sat, 13 Apr 2019 19:58:00 -0700
Subject: [PATCH] tf.distribute.Strategy.reduce: Remove default value for axis
 argument for Strategy V2, but keep for Strategy V1.

Change all callers to specify the argument, so their code continues to work
with Strategy V2. For now, most of the callers use axis=None. If this remains
the most common case, we can consider adding back the default in the future.
(A usage sketch illustrating the new calling convention follows the diff
below.)

PiperOrigin-RevId: 243462195
---
 .../python/collective_all_reduce_strategy_test.py   |  2 +-
 tensorflow/python/distribute/distribute_lib.py      | 13 ++++++++++---
 tensorflow/python/distribute/distribute_lib_test.py |  4 ++--
 tensorflow/python/distribute/input_lib.py           |  6 ++++--
 tensorflow/python/distribute/minimize_loss_test.py  |  3 ++-
 .../python/distribute/mirrored_strategy_test.py     |  2 +-
 tensorflow/python/distribute/values.py              |  3 ++-
 .../keras/distribute/distributed_training_utils.py  |  2 +-
 .../python/keras/engine/training_distributed.py     |  3 ++-
 .../mixed_precision/experimental/loss_scale.py      |  2 +-
 .../tensorflow.distribute.-mirrored-strategy.pbtxt  |  2 +-
 ...tensorflow.distribute.-one-device-strategy.pbtxt |  2 +-
 .../golden/v2/tensorflow.distribute.-strategy.pbtxt |  2 +-
 ...ute.experimental.-central-storage-strategy.pbtxt |  2 +-
 ...perimental.-multi-worker-mirrored-strategy.pbtxt |  2 +-
 ...te.experimental.-parameter-server-strategy.pbtxt |  2 +-
 ...ow.distribute.experimental.-t-p-u-strategy.pbtxt |  2 +-
 17 files changed, 33 insertions(+), 21 deletions(-)

diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
index 1156187b971..b029bc44a6b 100644
--- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
@@ -293,7 +293,7 @@ class CollectiveAllReduceStrategyTestBase(
       return array_ops.identity(x)
 
     x = distribution.extended.call_for_each_replica(model_fn)
-    reduced_x = distribution.reduce(reduce_util.ReduceOp.MEAN, x)
+    reduced_x = distribution.reduce(reduce_util.ReduceOp.MEAN, x, axis=None)
     x = distribution.experimental_local_results(x)[0]
 
     sess.run(variables.global_variables_initializer())
diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py
index 16dc12cc070..061d5bcca91 100644
--- a/tensorflow/python/distribute/distribute_lib.py
+++ b/tensorflow/python/distribute/distribute_lib.py
@@ -438,7 +438,7 @@ class Strategy(object):
     with self.scope():
       return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
 
-  def reduce(self, reduce_op, value, axis=None):
+  def reduce(self, reduce_op, value, axis):
     """Reduce `value` across replicas.
 
     Given a per-replica value returned by `experimental_run_v2`, say a
@@ -468,8 +468,10 @@
         be combined.
      value: A "per replica" value, e.g. returned by `experimental_run_v2` to
         be combined into a single tensor.
-      axis: Optional. Specifies the dimension to reduce along within each
-        replica's tensor. Should typically be set to the batch dimension.
+      axis: Specifies the dimension to reduce along within each
+        replica's tensor. Should typically be set to the batch dimension, or
+        `None` to only reduce across replicas (e.g. if the tensor has no batch
+        dimension).
 
     Returns:
       A `Tensor`.
@@ -729,6 +731,11 @@ class StrategyV1(Strategy):
     return super(StrategyV1, self).experimental_run(
         fn, input_iterator)
 
+  def reduce(self, reduce_op, value, axis=None):
+    return super(StrategyV1, self).reduce(reduce_op, value, axis)
+
+  reduce.__doc__ = Strategy.reduce.__doc__
+
   def update_config_proto(self, config_proto):
     """Returns a copy of `config_proto` modified for use with this strategy.
 
diff --git a/tensorflow/python/distribute/distribute_lib_test.py b/tensorflow/python/distribute/distribute_lib_test.py
index 451502da58c..5f12c634f52 100644
--- a/tensorflow/python/distribute/distribute_lib_test.py
+++ b/tensorflow/python/distribute/distribute_lib_test.py
@@ -321,7 +321,7 @@ class TestStrategyTest(test.TestCase):
   @_run_in_and_out_of_scope
   def testReduce(self, dist):
     x = constant_op.constant(1.)
-    x_r = dist.reduce(reduce_util.ReduceOp.MEAN, x)
+    x_r = dist.reduce(reduce_util.ReduceOp.MEAN, x, axis=None)
     self.assertEqual(self.evaluate(x), self.evaluate(x_r))
 
   def testReductions_acceptStringOps(self):
@@ -329,7 +329,7 @@ class TestStrategyTest(test.TestCase):
     for op in ("mean", "MEAN", "sum", "SUM"):
       x = constant_op.constant(1.)
       y = constant_op.constant(1.)
-      x_r = dist.reduce(op, x)
+      x_r = dist.reduce(op, x, axis=None)
       self.assertEqual(self.evaluate(x), self.evaluate(x_r))
       x_r = dist.extended.reduce_to(op, x, "/CPU:0")
       self.assertEqual(self.evaluate(x), self.evaluate(x_r))
diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py
index be619f72486..e90c32f6666 100644
--- a/tensorflow/python/distribute/input_lib.py
+++ b/tensorflow/python/distribute/input_lib.py
@@ -804,11 +804,13 @@ class MultiStepContext(object):
         self._last_step_outputs[name] = output
       else:
         distribution = distribution_strategy_context.get_strategy()
-        self._last_step_outputs[name] = distribution.reduce(reduce_op, output)
+        self._last_step_outputs[name] = distribution.reduce(reduce_op, output,
+                                                            axis=None)
     else:
       assert reduce_op is not None
       def merge_fn(distribution, value):
-        self._last_step_outputs[name] = distribution.reduce(reduce_op, value)
+        self._last_step_outputs[name] = distribution.reduce(reduce_op, value,
+                                                            axis=None)
         # Setting this inside the `merge_fn` because all replicas share the same
         # context object, so it's more robust to set it only once (even if all
         # the replicas are trying to set the same value).
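The StrategyV1 hunk above relies on a small compatibility pattern worth
spelling out: the V1 class restores the removed default by overriding
`reduce` and forwarding to the parent implementation, then copies the
parent's `__doc__` so both versions share one docstring (an overriding
function object does not inherit it automatically). A minimal,
self-contained sketch of the same pattern; the class names here are
illustrative, not from TensorFlow:

    class Base(object):
      """Stand-in for Strategy (V2): `axis` is required."""

      def reduce(self, reduce_op, value, axis):
        """Reduce `value` across replicas."""
        # Placeholder body; the real work happens in the strategy internals.
        del reduce_op, axis
        return value


    class V1Compat(Base):
      """Stand-in for StrategyV1: restores the `axis=None` default."""

      def reduce(self, reduce_op, value, axis=None):
        return super(V1Compat, self).reduce(reduce_op, value, axis)

      # The override defines a new function object whose __doc__ is None,
      # so the docstring is copied over explicitly, exactly as the patch does.
      reduce.__doc__ = Base.reduce.__doc__
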
diff --git a/tensorflow/python/distribute/minimize_loss_test.py b/tensorflow/python/distribute/minimize_loss_test.py
index ee514d6ee6d..6f6af500c53 100644
--- a/tensorflow/python/distribute/minimize_loss_test.py
+++ b/tensorflow/python/distribute/minimize_loss_test.py
@@ -514,7 +514,8 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
     if not reduced:
       self.assertLen(distribution.experimental_local_results(loss_output),
                      distribution.num_replicas_in_sync)
-      loss_tensor = distribution.reduce(reduce_util.ReduceOp.MEAN, loss_output)
+      loss_tensor = distribution.reduce(reduce_util.ReduceOp.MEAN, loss_output,
+                                        axis=None)
     else:
       unwrapped_output = distribution.experimental_local_results(loss_output)
       self.assertLen(unwrapped_output, 1)
diff --git a/tensorflow/python/distribute/mirrored_strategy_test.py b/tensorflow/python/distribute/mirrored_strategy_test.py
index 8686d4d253d..166dca5cada 100644
--- a/tensorflow/python/distribute/mirrored_strategy_test.py
+++ b/tensorflow/python/distribute/mirrored_strategy_test.py
@@ -103,7 +103,7 @@ class MirroredTwoDeviceDistributionTest(
   def testReduceToCpu(self, distribution):
     with distribution.scope():
       result = distribution.extended.call_for_each_replica(_replica_id)
-      reduced = distribution.reduce(reduce_util.ReduceOp.SUM, result)
+      reduced = distribution.reduce(reduce_util.ReduceOp.SUM, result, axis=None)
       expected = sum(range(distribution.num_replicas_in_sync))
       self.assertEqual(expected, self.evaluate(reduced))
 
diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py
index 7012eef2194..4268416e2f7 100644
--- a/tensorflow/python/distribute/values.py
+++ b/tensorflow/python/distribute/values.py
@@ -1310,7 +1310,8 @@ class SyncOnReadVariable(DistributedVariable, PerReplica):
     if self._aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA:
       return self.primary
     return self._distribute_strategy.reduce(
-        reduce_util.ReduceOp.from_variable_aggregation(self.aggregation), self)
+        reduce_util.ReduceOp.from_variable_aggregation(self.aggregation), self,
+        axis=None)
 
   def _as_graph_element(self):
     # pylint: disable=protected-access
diff --git a/tensorflow/python/keras/distribute/distributed_training_utils.py b/tensorflow/python/keras/distribute/distributed_training_utils.py
index 43a4a10ac1b..cbd802736e3 100644
--- a/tensorflow/python/keras/distribute/distributed_training_utils.py
+++ b/tensorflow/python/keras/distribute/distributed_training_utils.py
@@ -107,7 +107,7 @@ def unwrap_values(distribution_strategy, grouped_inputs, grouped_outputs,
   if with_loss_tensor:
     # reduce loss tensor before adding it to the list of fetches
     loss = distribution_strategy.reduce(reduce_util.ReduceOp.SUM,
-                                        grouped_outputs[0])
+                                        grouped_outputs[0], axis=None)
     all_outputs = flatten_perdevice_values(distribution_strategy,
                                            grouped_outputs[1:])
     all_outputs = [loss] + all_outputs
diff --git a/tensorflow/python/keras/engine/training_distributed.py b/tensorflow/python/keras/engine/training_distributed.py
index 93ebf5c9827..b45c645420c 100644
--- a/tensorflow/python/keras/engine/training_distributed.py
+++ b/tensorflow/python/keras/engine/training_distributed.py
@@ -497,7 +497,8 @@ def experimental_tpu_test_loop(model,
       # We reduce all other metrics using mean for now. This is temporary
       # workaround until new metrics are in place.
      reduce_op = ds_reduce_util.ReduceOp.MEAN
-    output_tensors[label] = current_strategy.reduce(reduce_op, output)
+    output_tensors[label] = current_strategy.reduce(reduce_op, output,
+                                                    axis=None)
   test_op = control_flow_ops.group(list(output_tensors.values()))
 
   if verbose >= 1:
diff --git a/tensorflow/python/keras/mixed_precision/experimental/loss_scale.py b/tensorflow/python/keras/mixed_precision/experimental/loss_scale.py
index d686eabd849..e72983ee491 100644
--- a/tensorflow/python/keras/mixed_precision/experimental/loss_scale.py
+++ b/tensorflow/python/keras/mixed_precision/experimental/loss_scale.py
@@ -289,7 +289,7 @@ class DynamicLossScale(LossScale):
       is_finite_float = distribution.extended.call_for_each_replica(
          get_is_finite, args=(grads,))
       reduced_is_finite_float = distribution.reduce(reduce_util.ReduceOp.SUM,
-                                                    is_finite_float)
+                                                    is_finite_float, axis=None)
       is_finite = math_ops.equal(reduced_is_finite_float,
                                  distribution.num_replicas_in_sync)
     else:
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt
index d4336e674c7..ca2d40eb7f5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt
@@ -53,7 +53,7 @@ tf_class {
   }
   member_method {
     name: "reduce"
-    argspec: "args=[\'self\', \'reduce_op\', \'value\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'reduce_op\', \'value\', \'axis\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "scope"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-one-device-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-one-device-strategy.pbtxt
index f460ace8cbc..b86a87a07ac 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-one-device-strategy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-one-device-strategy.pbtxt
@@ -53,7 +53,7 @@ tf_class {
   }
   member_method {
     name: "reduce"
-    argspec: "args=[\'self\', \'reduce_op\', \'value\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'reduce_op\', \'value\', \'axis\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "scope"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt
index 941bc034063..52d8b0d040f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt
@@ -52,7 +52,7 @@ tf_class {
   }
   member_method {
     name: "reduce"
-    argspec: "args=[\'self\', \'reduce_op\', \'value\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'reduce_op\', \'value\', \'axis\'], varargs=None, keywords=None, defaults=None"
  }
  member_method {
    name: "scope"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-central-storage-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-central-storage-strategy.pbtxt
index c0e2e8b2389..e1ae51ef1b1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-central-storage-strategy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-central-storage-strategy.pbtxt
@@ -53,7 +53,7 @@ tf_class {
   }
   member_method {
     name: "reduce"
-    argspec: "args=[\'self\', \'reduce_op\', \'value\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'reduce_op\', \'value\', \'axis\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "scope"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt
index 3547b085a6f..42c35a8aa0f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt
@@ -53,7 +53,7 @@ tf_class {
   }
   member_method {
     name: "reduce"
-    argspec: "args=[\'self\', \'reduce_op\', \'value\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'reduce_op\', \'value\', \'axis\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "scope"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt
index 230b8515114..a3049fca55a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt
@@ -53,7 +53,7 @@ tf_class {
   }
   member_method {
     name: "reduce"
-    argspec: "args=[\'self\', \'reduce_op\', \'value\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'reduce_op\', \'value\', \'axis\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "scope"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-t-p-u-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-t-p-u-strategy.pbtxt
index a2feb6191ab..31031c790af 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-t-p-u-strategy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-t-p-u-strategy.pbtxt
@@ -53,7 +53,7 @@ tf_class {
   }
   member_method {
     name: "reduce"
-    argspec: "args=[\'self\', \'reduce_op\', \'value\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'reduce_op\', \'value\', \'axis\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "scope"
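
As promised in the commit message, here is a hand-written sketch of the
calling convention this patch enforces. It is illustrative only: it assumes a
TensorFlow build from this era, where `experimental_run_v2` is the per-replica
entry point (later renamed `Strategy.run`), and the strategy and values below
are made up:

    import tensorflow as tf

    strategy = tf.distribute.MirroredStrategy()

    def step_fn():
      # Stand-in for a per-example loss computed on each replica's local batch.
      return tf.constant([1.0, 2.0, 3.0, 4.0])

    per_replica = strategy.experimental_run_v2(step_fn)

    # axis=None: combine across replicas only; the per-example (batch)
    # dimension survives.
    per_example = strategy.reduce(
        tf.distribute.ReduceOp.MEAN, per_replica, axis=None)

    # axis=0: additionally reduce over the batch dimension within each
    # replica, yielding a single scalar.
    global_mean = strategy.reduce(
        tf.distribute.ReduceOp.MEAN, per_replica, axis=0)

With Strategy V2 the `axis` argument is now mandatory; under StrategyV1 the
same calls still work, and omitting `axis` keeps the old replicas-only
behavior.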
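
The golden-file churn at the end of the patch follows mechanically from the
signature change: the v2 API goldens record each method's inspected argument
spec, and a function with no defaults reports `defaults=None` rather than a
one-element tuple. The effect can be reproduced with the standard library's
`inspect` module (the function names below are made up for illustration):

    import inspect

    def v1_reduce(self, reduce_op, value, axis=None):
      pass

    def v2_reduce(self, reduce_op, value, axis):
      pass

    print(inspect.getfullargspec(v1_reduce).defaults)  # (None,)
    print(inspect.getfullargspec(v2_reduce).defaults)  # None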