Split ParameterServerStrategy into multi-worker and local version.
PiperOrigin-RevId: 243347187
Commit fec0d5fd20 (parent f47745d60a)
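For context, a minimal usage sketch of the two entry points this split produces; it is not part of the diff, and the host addresses are placeholders. Locally, CentralStorageStrategy keeps variables on one device; for multi-worker training, ParameterServerStrategy now requires a cluster spec (by default read from TF_CONFIG):

# Sketch only; assumes the APIs introduced/changed in this commit.
import json
import os

import tensorflow as tf

# Local training: variables on a single device, compute replicated across
# local GPUs (or run on the CPU if there are none).
central = tf.distribute.experimental.CentralStorageStrategy()

# Multi-worker training: needs a non-empty cluster spec, read here from the
# TF_CONFIG environment variable (placeholder addresses).
os.environ["TF_CONFIG"] = json.dumps({
    "cluster": {"worker": ["host1:2222", "host2:2222"], "ps": ["host3:2222"]},
    "task": {"type": "worker", "index": 0},
})
ps = tf.distribute.experimental.ParameterServerStrategy()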
@@ -56,10 +56,12 @@ cuda_py_test(
name = "parameter_server_strategy_test",
srcs = ["parameter_server_strategy_test.py"],
additional_deps = [
":parameter_server_strategy",
"//tensorflow/python/distribute:central_storage_strategy",
"//tensorflow/python/distribute:combinations",
"//tensorflow/python/distribute:parameter_server_strategy",
"//tensorflow/python/distribute:strategy_combinations",
"//tensorflow/python/distribute:multi_worker_test_base",
":parameter_server_strategy",
"//tensorflow/python/distribute:strategy_test_lib",
"@absl_py//absl/testing:parameterized",
"//tensorflow/core:protos_all_py",
@@ -24,9 +24,9 @@ from absl.testing import parameterized
from tensorflow.contrib.distribute.python import parameter_server_strategy
from tensorflow.core.protobuf import config_pb2
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.distribute import central_storage_strategy
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import device_util
from tensorflow.python.distribute import distribute_lib
from tensorflow.python.distribute import distribution_strategy_context as ds_context
from tensorflow.python.distribute import multi_worker_test_base
from tensorflow.python.distribute import multi_worker_util
@@ -52,7 +52,6 @@ from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import variables
from tensorflow.python.platform import test
from tensorflow.python.training import training_util
from tensorflow.python.training.server_lib import ClusterSpec

CHIEF = run_config.TaskType.CHIEF
WORKER = run_config.TaskType.WORKER
@@ -66,15 +65,6 @@ def _get_replica_id_integer():
return replica_id


class MockCoreParameterServerStrategy(distribute_lib.StrategyV1):
"""Mock the strategy to allow cluster resolver as an argument."""

def __init__(self, cluster_resolver):
super(MockCoreParameterServerStrategy, self).__init__(
core_parameter_server_strategy.ParameterServerStrategyExtended(
self, cluster_resolver=cluster_resolver))


def create_test_objects(cluster_spec=None,
task_type=None,
task_id=None,
@@ -91,13 +81,15 @@ def create_test_objects(cluster_spec=None,
task_type=task_type,
task_id=task_id,
num_accelerators={'GPU': num_gpus})
distribution = core_parameter_server_strategy.ParameterServerStrategy(
cluster_resolver)
target = 'grpc://' + cluster_spec[WORKER][task_id]
else:
cluster_resolver = SimpleClusterResolver(
ClusterSpec({}), num_accelerators={'GPU': num_gpus})
distribution = (
central_storage_strategy.CentralStorageStrategy._from_num_gpus(
num_gpus))
target = ''

distribution = MockCoreParameterServerStrategy(cluster_resolver)
sess_config = copy.deepcopy(sess_config)
sess_config = distribution.update_config_proto(sess_config)
else:
@@ -440,7 +432,8 @@ class ParameterServerStrategyTestBase(
x, y, z, train_op = d.extended.call_for_each_replica(model_fn)
train_op = d.group(train_op)

if context.num_gpus() < d.extended._num_gpus_per_worker:
if context.num_gpus() < sum(
1 for d in d.extended.worker_devices if 'GPU' in d.upper()):
return True

if task_id == 0:
@@ -536,7 +529,8 @@ class ParameterServerStrategyTestBase(

before_out, after_out = step()

if context.num_gpus() < d.extended._num_gpus_per_worker:
if context.num_gpus() < sum(
1 for d in d.extended.worker_devices if 'GPU' in d.upper()):
return True

if (not task_type or
@@ -778,9 +772,11 @@ class ParameterServerStrategyTest(
combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
def testUpdateConfigProtoMultiWorker(self, use_core_strategy):
strategy, _, _ = create_test_objects(
num_gpus=2, use_core_strategy=use_core_strategy)
strategy.configure(
cluster_spec=self._cluster_spec, task_type='worker', task_id=1)
cluster_spec=self._cluster_spec,
task_type='worker',
task_id=1,
num_gpus=2,
use_core_strategy=use_core_strategy)

config_proto = config_pb2.ConfigProto(device_filters=['to_be_overridden'])

@@ -923,8 +919,8 @@ class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase,
strategy.extended.call_for_each_replica(f)


class LocalParameterServerStrategyTest(strategy_test_lib.DistributionTestBase,
parameterized.TestCase):
class CentralStorageStrategyTest(strategy_test_lib.DistributionTestBase,
parameterized.TestCase):

@combinations.generate(combinations.combine(mode=['graph', 'eager'],
use_core_strategy=[True, False],
@@ -276,6 +276,18 @@ py_library(
],
)

py_library(
name = "central_storage_strategy",
srcs = ["central_storage_strategy.py"],
visibility = ["//tensorflow:internal"],
deps = [
":device_util",
":distribute_lib",
":parameter_server_strategy",
"//tensorflow/python:util",
],
)

py_library(
name = "one_device_strategy",
srcs = ["one_device_strategy.py"],
@@ -531,11 +543,11 @@ py_library(
srcs = ["strategy_combinations.py"],
srcs_version = "PY2AND3",
deps = [
":central_storage_strategy",
":combinations",
":distribute_lib",
":mirrored_strategy",
":one_device_strategy",
":parameter_server_strategy",
":tpu_strategy",
"//tensorflow/python:framework_ops",
"//tensorflow/python:training",
tensorflow/python/distribute/central_storage_strategy.py (new file, 66 lines)
@@ -0,0 +1,66 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Classes implementing a multi-worker ps DistributionStrategy."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from tensorflow.python.distribute import device_util
from tensorflow.python.distribute import distribute_lib
from tensorflow.python.distribute import parameter_server_strategy
from tensorflow.python.util.tf_export import tf_export


@tf_export("distribute.experimental.CentralStorageStrategy", v1=[])
class CentralStorageStrategy(distribute_lib.Strategy):
  """A one-machine strategy that puts all variables on a single device.

  Variables are assigned to local CPU or the only GPU. If there is more
  than one GPU, compute operations (other than variable update operations)
  will be replicated across all GPUs.

  Args:
    compute_devices: an optional list of strings for device to replicate models
      on. If this is not provided, all local GPUs will be used; if there is no
      GPU, local CPU will be used.
    parameter_device: an optional device string for which device to put
      variables on. The default one is CPU or GPU if there is only one.
  """

  def __init__(self, compute_devices=None, parameter_device=None):
    extended = parameter_server_strategy.ParameterServerStrategyExtended(
        self,
        compute_devices=compute_devices,
        parameter_device=parameter_device)
    super(CentralStorageStrategy, self).__init__(extended)

  @classmethod
  def _from_num_gpus(cls, num_gpus):
    return cls(device_util.local_devices_from_num_gpus(num_gpus))


@tf_export(v1=["distribute.experimental.CentralStorageStrategy"])
class CentralStorageStrategyV1(distribute_lib.StrategyV1):

  __doc__ = CentralStorageStrategy.__doc__

  def __init__(self, compute_devices=None, parameter_device=None):
    """Initializes this strategy with default TFConfigClusterResolver."""
    super(CentralStorageStrategyV1, self).__init__(
        parameter_server_strategy.ParameterServerStrategyExtended(
            self,
            compute_devices=compute_devices,
            parameter_device=parameter_device))
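A hedged usage sketch for the new class (not part of the commit); the two-GPU device list and the variable are illustrative only:

# Sketch: variables go to the parameter device, compute is replicated.
import tensorflow as tf

strategy = tf.distribute.experimental.CentralStorageStrategy(
    compute_devices=["/device:GPU:0", "/device:GPU:1"],  # assumes a 2-GPU host
    parameter_device="/device:CPU:0")

with strategy.scope():
  v = tf.Variable(1.0)  # created on the parameter device, /device:CPU:0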
@@ -108,3 +108,9 @@ def get_host_for_device(device):
return tf_device.DeviceSpec(
job=spec.job, replica=spec.replica, task=spec.task,
device_type="CPU", device_index=0).to_string()


def local_devices_from_num_gpus(num_gpus):
"""Returns device strings for local GPUs or CPU."""
return (tuple("/device:GPU:%d" % i for i in range(num_gpus)) or
("/device:CPU:0",))
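As a quick check of the new helper's behavior (a sketch derived from its body above, not part of the diff):

from tensorflow.python.distribute import device_util

assert device_util.local_devices_from_num_gpus(2) == (
    "/device:GPU:0", "/device:GPU:1")
assert device_util.local_devices_from_num_gpus(0) == ("/device:CPU:0",)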
@@ -13,6 +13,7 @@ py_library(
],
srcs_version = "PY2AND3",
deps = [
"//tensorflow/python/distribute:central_storage_strategy",
"//tensorflow/python/distribute:collective_all_reduce_strategy",
"//tensorflow/python/distribute:parameter_server_strategy",
"//tensorflow/python/distribute:tpu_strategy",
@@ -19,6 +19,7 @@ from __future__ import division
from __future__ import print_function

# pylint: disable=unused-import
from tensorflow.python.distribute import central_storage_strategy
from tensorflow.python.distribute import collective_all_reduce_strategy
from tensorflow.python.distribute import parameter_server_strategy
from tensorflow.python.distribute import tpu_strategy
@@ -407,8 +407,7 @@ def _infer_num_gpus_per_worker(devices):
def all_local_devices(num_gpus=None):
if num_gpus is None:
num_gpus = context.num_gpus()
return (tuple("/device:GPU:%d" % i for i in range(num_gpus)) or
("/device:CPU:0",))
return device_util.local_devices_from_num_gpus(num_gpus)


def _all_devices():
@@ -43,32 +43,32 @@ from tensorflow.python.util import nest
from tensorflow.python.util.tf_export import tf_export

_LOCAL_CPU = "/device:CPU:0"
_LOCAL_GPU_0 = "/device:GPU:0"


# TODO(yuefengz): maybe cache variables on local CPU.
@tf_export("distribute.experimental.ParameterServerStrategy", v1=[])
class ParameterServerStrategy(distribute_lib.Strategy):
"""A parameter server DistributionStrategy.
"""An asynchronous multi-worker parameter server DistributionStrategy.

This strategy requires two jobs: workers and parameter servers. Variables and
updates to those variables will be assigned to parameter servers and other
operations are assigned to workers.

This strategy class works for both local training and between-graph replicated
training for multiple workers. It uses `TFConfigClusterResolver` to detect
configurations for multi-worker training. In multi-worker training mode, i.e.
`TFConfigClusterResolver` has detected 'TF_CONFIG' environment variable and
'TF_CONFIG' has a cluster spec, variables and updates to those variables are
assigned to parameter servers and other operations are assigned to workers.
In local training mode, variables are assigned to local CPU or the only GPU.
When each worker has more than one GPU, operations will be replicated on these
GPUs. In both cases, operations are replicated but variables are not and these
workers share a common view for which parameter server a variable is assigned
GPUs. Even though operations may be replicated, variables are not and each
worker shares a common view for which parameter server a variable is assigned
to.

This class assumes between-graph replication will be used and works on a graph
for a particular worker. Note that each graph and worker is independent.
This means that while each worker will synchronously compute a single gradient
update across all GPUs, updates between workers proceed asynchronously.
Operations that occur only on the first replica (such as incrementing the
global step), will occur on the first replica *of every worker*.
By default it uses `TFConfigClusterResolver` to detect configurations for
multi-worker training. This requires a 'TF_CONFIG' environment variable and
the 'TF_CONFIG' must have a cluster spec.

This class assumes each worker is running the same code independently, but
parameter servers are running a standard server. This means that while each
worker will synchronously compute a single gradient update across all GPUs,
updates between workers proceed asynchronously. Operations that occur only on
the first replica (such as incrementing the global step), will occur on the
first replica *of every worker*.

It is expected to call `call_for_each_replica(fn, ...)` for any
operations which potentially can be replicated across replicas (i.e. multiple
@@ -86,10 +86,21 @@ class ParameterServerStrategy(distribute_lib.Strategy):
possibly create conflicts of device assignment.
"""

def __init__(self):
"""Initializes this strategy with default TFConfigClusterResolver."""
super(ParameterServerStrategy, self).__init__(
ParameterServerStrategyExtended(self))
def __init__(self, cluster_resolver=None):
"""Initializes this strategy.

Args:
cluster_resolver: Optional
`tf.distribute.cluster_resolver.ClusterResolver` object. Defaults to a
`tf.distribute.cluster_resolver.TFConfigClusterResolver`.
"""
if cluster_resolver is None:
cluster_resolver = TFConfigClusterResolver()
if not cluster_resolver.cluster_spec():
raise ValueError("Cluster spec must be non-empty in `cluster_resolver`.")
extended = ParameterServerStrategyExtended(
self, cluster_resolver=cluster_resolver)
super(ParameterServerStrategy, self).__init__(extended)


@tf_export(v1=["distribute.experimental.ParameterServerStrategy"])
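A sketch of the new V2 constructor contract (not from the diff; the cluster addresses and resolver setup are illustrative): construction succeeds only when the resolver carries a non-empty cluster spec, otherwise the ValueError above is raised.

from tensorflow.python.distribute import parameter_server_strategy
from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
from tensorflow.python.training.server_lib import ClusterSpec

resolver = SimpleClusterResolver(
    ClusterSpec({"worker": ["host1:2222"], "ps": ["host2:2222"]}),  # placeholders
    task_type="worker", task_id=0, num_accelerators={"GPU": 0})
strategy = parameter_server_strategy.ParameterServerStrategy(
    cluster_resolver=resolver)

# With no resolver and no cluster in TF_CONFIG, __init__ raises:
#   ValueError: Cluster spec must be non-empty in `cluster_resolver`.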
@@ -97,31 +108,41 @@ class ParameterServerStrategyV1(distribute_lib.StrategyV1):

__doc__ = ParameterServerStrategy.__doc__

def __init__(self):
"""Initializes this strategy with default TFConfigClusterResolver."""
def __init__(self, cluster_resolver=None):
"""Initializes this strategy."""
super(ParameterServerStrategyV1, self).__init__(
ParameterServerStrategyExtended(self))
ParameterServerStrategyExtended(
self, cluster_resolver=cluster_resolver))


# TODO(josh11b): Switch to V2 when we no longer need to support tf.compat.v1.
class ParameterServerStrategyExtended(distribute_lib.StrategyExtendedV1):
"""Implementation of ParameterServerStrategy."""
"""Implementation of ParameterServerStrategy and CentralStorageStrategy."""

def __init__(self,
container_strategy,
cluster_resolver=TFConfigClusterResolver()):
cluster_resolver=None,
compute_devices=None,
parameter_device=None):
super(ParameterServerStrategyExtended, self).__init__(container_strategy)
self._initialize_strategy(cluster_resolver)
self._initialize_strategy(
cluster_resolver=cluster_resolver,
compute_devices=compute_devices,
parameter_device=parameter_device)

# We typically don't need to do all-reduce in this strategy.
self._cross_device_ops = (
cross_device_ops_lib.ReductionToOneDevice(reduce_to_device=_LOCAL_CPU))

def _initialize_strategy(self, cluster_resolver):
if cluster_resolver.cluster_spec().as_dict():
def _initialize_strategy(self,
cluster_resolver=None,
compute_devices=None,
parameter_device=None):
if cluster_resolver and cluster_resolver.cluster_spec():
self._initialize_multi_worker(cluster_resolver)
else:
self._initialize_local(cluster_resolver)
self._initialize_local(
compute_devices, parameter_device, cluster_resolver=cluster_resolver)

def _initialize_multi_worker(self, cluster_resolver):
"""Initialize devices for multiple workers.
@@ -214,43 +235,41 @@ class ParameterServerStrategyExtended(distribute_lib.StrategyExtendedV1):
num_ps_replicas, self._is_chief, self._device_map,
self._variable_device)

def _initialize_local(self, cluster_resolver):
# TODO(yuefengz): get rid of cluster_resolver argument when contrib's
# version no longer depends on this class.
def _initialize_local(self,
compute_devices,
parameter_device,
cluster_resolver=None):
"""Initialize internal devices for local training."""
worker_device = device_util.canonicalize("/device:CPU:0")
self._input_host_device = numpy_dataset.SingleDevice(worker_device)

# TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
# some cases.
if isinstance(cluster_resolver, TFConfigClusterResolver):
num_gpus = context.num_gpus()
else:
num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)
if compute_devices is None:
if not cluster_resolver:
num_gpus = context.num_gpus()
else:
num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)
# Save the num_gpus_per_worker for configure method which is used by the
# contrib version.
self._num_gpus_per_worker = num_gpus

# Save the num_gpus_per_worker for configure method.
self._num_gpus_per_worker = num_gpus
compute_devices = device_util.local_devices_from_num_gpus(num_gpus)

# Define compute devices which is a list of device strings and one for each
# replica. When there are GPUs, replicate operations on these GPUs.
# Otherwise, place operations on CPU.
if num_gpus > 0:
compute_devices = tuple(map("/device:GPU:{}".format, range(num_gpus)))
else:
compute_devices = (_LOCAL_CPU,)
if parameter_device is None:
# If there is only one GPU, put everything on that GPU. Otherwise, place
# variables on CPU.
if len(compute_devices) == 1:
parameter_device = compute_devices[0]
else:
parameter_device = _LOCAL_CPU

self._device_map = values.ReplicaDeviceMap(compute_devices)
self._input_workers = input_lib.InputWorkers(
self._device_map, [(worker_device, compute_devices)])

# If there is only one GPU, put everything on that GPU. Otherwise, place
# variables on CPU.
if num_gpus == 1:
assert len(compute_devices) == 1
self._variable_device = _LOCAL_GPU_0
self._parameter_devices = (_LOCAL_GPU_0,)
else:
self._variable_device = _LOCAL_CPU
self._parameter_devices = (_LOCAL_CPU,)

self._variable_device = parameter_device
self._parameter_devices = (parameter_device,)
self._is_chief = True
self._cluster_spec = None
self._task_type = None
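The local path's device placement rule, restated as a small standalone sketch (this mirrors the hunk above; it is not code from the commit):

def _local_placement(num_gpus):
  # Compute devices: one per local GPU, or the CPU if there are none.
  compute = tuple("/device:GPU:%d" % i for i in range(num_gpus)) or (
      "/device:CPU:0",)
  # A single compute device also holds the variables; otherwise they stay on CPU.
  parameter = compute[0] if len(compute) == 1 else "/device:CPU:0"
  return compute, parameter

assert _local_placement(1) == (("/device:GPU:0",), "/device:GPU:0")
assert _local_placement(4)[1] == "/device:CPU:0"
assert _local_placement(0) == (("/device:CPU:0",), "/device:CPU:0")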
@@ -17,6 +17,7 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.python.distribute import central_storage_strategy
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import distribution_strategy_context
from tensorflow.python.distribute import mirrored_strategy as mirrored_lib
@@ -100,6 +101,10 @@ mirrored_strategy_with_two_gpus = combinations.NamedDistribution(
"Mirrored2GPUs",
lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/gpu:1"]),
required_gpus=2)
central_storage_strategy_with_two_gpus = combinations.NamedDistribution(
"CentralStorage2GPUs",
lambda: central_storage_strategy.CentralStorageStrategy._from_num_gpus(2),  # pylint: disable=protected-access
required_gpus=2)

gradient_descent_optimizer_v1_fn = combinations.NamedObject(
"GradientDescentV1", lambda: gradient_descent.GradientDescentOptimizer(0.2))
@@ -23,7 +23,6 @@ from absl.testing import parameterized
from tensorflow.core.protobuf import config_pb2
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import device_util
from tensorflow.python.distribute import parameter_server_strategy
from tensorflow.python.distribute import strategy_combinations
from tensorflow.python.distribute import values
from tensorflow.python.eager import context
@@ -40,16 +39,6 @@ from tensorflow.python.training import saver as saver_lib
from tensorflow.python.training.tracking import util as trackable_utils


# TODO(rchao): Merge parameter_server_strategy_with_two_gpus into
# third_party/tensorflow/python/distribute/strategy_combinations.py
# pylint: disable=g-long-lambda
parameter_server_strategy_with_two_gpus = combinations.NamedDistribution(
"ParameterServer2GPUs",
lambda: parameter_server_strategy.ParameterServerStrategy(
num_gpus_per_worker=2),
required_gpus=2)


class DistributedValuesTest(test.TestCase):

def testGetEager(self):
@@ -561,7 +550,9 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase):

@combinations.generate(
combinations.combine(
distribution=[parameter_server_strategy_with_two_gpus],
distribution=[
strategy_combinations.central_storage_strategy_with_two_gpus
],
mode=["graph", "eager"]))
def testAssignOutOfScope_aggregating(self, distribution):
with distribution.scope():
@@ -577,7 +568,7 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase):
strategy_combinations.mirrored_strategy_with_one_cpu,
strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
strategy_combinations.tpu_strategy,
parameter_server_strategy_with_two_gpus,
strategy_combinations.central_storage_strategy_with_two_gpus,
],
mode=["graph", "eager"]))
def testExtendsVariable(self, distribution):
@@ -591,7 +582,7 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase):
strategy_combinations.mirrored_strategy_with_one_cpu,
strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
strategy_combinations.tpu_strategy,
parameter_server_strategy_with_two_gpus,
strategy_combinations.central_storage_strategy_with_two_gpus,
],
mode=["graph", "eager"]))
def testCheckpointing(self, distribution):

@@ -318,10 +318,12 @@ py_library(
"//tensorflow/python/distribute:combinations",
"//tensorflow/python/distribute:distribute_config",
"//tensorflow/python/distribute:distribute_coordinator",
"//tensorflow/python/distribute:distribute_lib",
"//tensorflow/python/distribute:mirrored_strategy",
"//tensorflow/python/distribute:multi_worker_test_base",
"//tensorflow/python/distribute:parameter_server_strategy",
"//tensorflow/python/distribute:strategy_combinations",
"//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib",
"//tensorflow/python/eager:context",
"//tensorflow/python/keras",
],
@@ -23,7 +23,7 @@ import numpy as np
from tensorflow.python import keras
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import distribution_strategy_context as ds_context
from tensorflow.python.distribute import parameter_server_strategy
from tensorflow.python.distribute import strategy_combinations
from tensorflow.python.eager import context
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
@@ -36,15 +36,6 @@ from tensorflow.python.ops import variables
from tensorflow.python.platform import test


# TODO(rchao): Merge parameter_server_strategy_with_two_gpus into
# third_party/tensorflow/python/distribute/strategy_combinations.py
# pylint: disable=g-long-lambda
parameter_server_strategy_with_two_gpus = combinations.NamedDistribution(
'ParameterServer2GPUs',
lambda: parameter_server_strategy.ParameterServerStrategy(),
required_gpus=2)


def get_model():
x = keras.layers.Input(shape=(3,), name='input')
y = keras.layers.Dense(4, name='dense')(x)
@@ -57,7 +48,7 @@ class MirroredStrategyOptimizerV2Test(test.TestCase, parameterized.TestCase):
@combinations.generate(
combinations.combine(
distribution=[
parameter_server_strategy_with_two_gpus,
strategy_combinations.central_storage_strategy_with_two_gpus,
],
mode=['graph', 'eager']))
def testKerasOptimizerWithUnequalInput(self, distribution):
@@ -114,7 +105,7 @@ class MirroredStrategyOptimizerV2Test(test.TestCase, parameterized.TestCase):
@combinations.generate(
combinations.combine(
distribution=[
parameter_server_strategy_with_two_gpus,
strategy_combinations.central_storage_strategy_with_two_gpus,
],
mode=['graph', 'eager']))
def testOptimizerWithKerasModelAndNumpyArrays(self, distribution):

@@ -34,9 +34,11 @@ from tensorflow.python.distribute import collective_all_reduce_strategy as colle
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import distribute_coordinator as dc
from tensorflow.python.distribute import distribute_coordinator_context as dc_context
from tensorflow.python.distribute import distribute_lib
from tensorflow.python.distribute import mirrored_strategy
from tensorflow.python.distribute import multi_worker_test_base as test_base
from tensorflow.python.distribute import parameter_server_strategy
from tensorflow.python.distribute.cluster_resolver import TFConfigClusterResolver
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.keras import backend
@@ -51,6 +53,23 @@ from tensorflow.python.platform import test
from tensorflow.python.util import nest


# TODO(b/130375202): remove this class which is a temporary solution before we
# get rid of configure method.
class ParameterServerStrategy(distribute_lib.Strategy):
"""Temporarily mock the original strategy to bypass cluster_spec check."""

def __init__(self, cluster_resolver=None):
"""Initializes this strategy."""
# The `cluster_resolver` must be set so that
# `ParameterServerStrategyExtended` will keep num_gpus for `configure`
# method.
if cluster_resolver is None:
cluster_resolver = TFConfigClusterResolver()
extended = parameter_server_strategy.ParameterServerStrategyExtended(
self, cluster_resolver=cluster_resolver)
super(ParameterServerStrategy, self).__init__(extended)


def _mnist_synthetic_dataset(batch_size, steps_per_epoch):
# train dataset
x_train = array_ops.ones([batch_size * steps_per_epoch, 28, 28, 1],
@@ -301,7 +320,7 @@ class KerasMultiWorkerTestStandaloneClient(test.TestCase,
mode=['graph'],
strategy_cls=[
mirrored_strategy.MirroredStrategy,
parameter_server_strategy.ParameterServerStrategy,
ParameterServerStrategy,
collective_strategy.CollectiveAllReduceStrategy,
],
required_gpus=[0, 1]))
@@ -383,7 +402,7 @@ class KerasMultiWorkerTestIndependentWorker(test_base.IndependentWorkerTestBase,
@combinations.generate(
combinations.combine(
mode=['graph'],
strategy_cls=[parameter_server_strategy.ParameterServerStrategy],
strategy_cls=[ParameterServerStrategy],
required_gpus=[0, 1]))
def testSimpleModelIndependentWorkerAsync(self, strategy_cls):
num_workers = 2
@@ -0,0 +1,67 @@
path: "tensorflow.distribute.experimental.CentralStorageStrategy"
tf_class {
is_instance: "<class \'tensorflow.python.distribute.central_storage_strategy.CentralStorageStrategyV1\'>"
is_instance: "<class \'tensorflow.python.distribute.distribute_lib.StrategyV1\'>"
is_instance: "<class \'tensorflow.python.distribute.distribute_lib.Strategy\'>"
is_instance: "<type \'object\'>"
member {
name: "extended"
mtype: "<type \'property\'>"
}
member {
name: "num_replicas_in_sync"
mtype: "<type \'property\'>"
}
member_method {
name: "__init__"
argspec: "args=[\'self\', \'compute_devices\', \'parameter_device\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
}
member_method {
name: "colocate_vars_with"
argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "configure"
argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
}
member_method {
name: "experimental_local_results"
argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "experimental_run"
argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
member_method {
name: "experimental_run_v2"
argspec: "args=[\'self\', \'fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
}
member_method {
name: "group"
argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
member_method {
name: "make_dataset_iterator"
argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "make_input_fn_iterator"
argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
}
member_method {
name: "reduce"
argspec: "args=[\'self\', \'reduce_op\', \'value\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
member_method {
name: "scope"
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "unwrap"
argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "update_config_proto"
argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
}
}
@@ -14,7 +14,7 @@ tf_class {
}
member_method {
name: "__init__"
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
argspec: "args=[\'self\', \'cluster_resolver\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
member_method {
name: "colocate_vars_with"
@@ -1,5 +1,9 @@
path: "tensorflow.distribute.experimental"
tf_module {
member {
name: "CentralStorageStrategy"
mtype: "<type \'type\'>"
}
member {
name: "CollectiveCommunication"
mtype: "<class \'enum.EnumMeta\'>"
@@ -0,0 +1,66 @@
path: "tensorflow.distribute.experimental.CentralStorageStrategy"
tf_class {
is_instance: "<class \'tensorflow.python.distribute.central_storage_strategy.CentralStorageStrategy\'>"
is_instance: "<class \'tensorflow.python.distribute.distribute_lib.Strategy\'>"
is_instance: "<type \'object\'>"
member {
name: "extended"
mtype: "<type \'property\'>"
}
member {
name: "num_replicas_in_sync"
mtype: "<type \'property\'>"
}
member_method {
name: "__init__"
argspec: "args=[\'self\', \'compute_devices\', \'parameter_device\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
}
member_method {
name: "colocate_vars_with"
argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "configure"
argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
}
member_method {
name: "experimental_local_results"
argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "experimental_run"
argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
member_method {
name: "experimental_run_v2"
argspec: "args=[\'self\', \'fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
}
member_method {
name: "group"
argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
member_method {
name: "make_dataset_iterator"
argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "make_input_fn_iterator"
argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
}
member_method {
name: "reduce"
argspec: "args=[\'self\', \'reduce_op\', \'value\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
member_method {
name: "scope"
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "unwrap"
argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "update_config_proto"
argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
}
}
@@ -13,7 +13,7 @@ tf_class {
}
member_method {
name: "__init__"
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
argspec: "args=[\'self\', \'cluster_resolver\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
member_method {
name: "colocate_vars_with"
@@ -1,5 +1,9 @@
path: "tensorflow.distribute.experimental"
tf_module {
member {
name: "CentralStorageStrategy"
mtype: "<type \'type\'>"
}
member {
name: "CollectiveCommunication"
mtype: "<class \'enum.EnumMeta\'>"