Split ParameterServerStrategy into multi-worker and local version.
PiperOrigin-RevId: 243347187
Commit fec0d5fd20 (parent f47745d60a)
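For context, a minimal usage sketch of the two entry points this split produces; it is not part of the diff, and the host addresses are placeholders. Locally, CentralStorageStrategy keeps variables on one device; for multi-worker training, ParameterServerStrategy now requires a cluster spec (by default read from TF_CONFIG):

# Sketch only; assumes the APIs introduced/changed in this commit.
import json
import os

import tensorflow as tf

# Local training: variables on a single device, compute replicated across
# local GPUs (or run on the CPU if there are none).
central = tf.distribute.experimental.CentralStorageStrategy()

# Multi-worker training: needs a non-empty cluster spec, read here from the
# TF_CONFIG environment variable (placeholder addresses).
os.environ["TF_CONFIG"] = json.dumps({
    "cluster": {"worker": ["host1:2222", "host2:2222"], "ps": ["host3:2222"]},
    "task": {"type": "worker", "index": 0},
})
ps = tf.distribute.experimental.ParameterServerStrategy()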
@@ -56,10 +56,12 @@ cuda_py_test(
name = "parameter_server_strategy_test",
srcs = ["parameter_server_strategy_test.py"],
additional_deps = [
":parameter_server_strategy",
"//tensorflow/python/distribute:central_storage_strategy",
"//tensorflow/python/distribute:combinations",
"//tensorflow/python/distribute:parameter_server_strategy",
"//tensorflow/python/distribute:strategy_combinations",
"//tensorflow/python/distribute:multi_worker_test_base",
":parameter_server_strategy",
"//tensorflow/python/distribute:strategy_test_lib",
"@absl_py//absl/testing:parameterized",
"//tensorflow/core:protos_all_py",
@@ -24,9 +24,9 @@ from absl.testing import parameterized
from tensorflow.contrib.distribute.python import parameter_server_strategy
from tensorflow.core.protobuf import config_pb2
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.distribute import central_storage_strategy
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import device_util
from tensorflow.python.distribute import distribute_lib
from tensorflow.python.distribute import distribution_strategy_context as ds_context
from tensorflow.python.distribute import multi_worker_test_base
from tensorflow.python.distribute import multi_worker_util
@@ -52,7 +52,6 @@ from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import variables
from tensorflow.python.platform import test
from tensorflow.python.training import training_util
from tensorflow.python.training.server_lib import ClusterSpec

CHIEF = run_config.TaskType.CHIEF
WORKER = run_config.TaskType.WORKER
@@ -66,15 +65,6 @@ def _get_replica_id_integer():
return replica_id


class MockCoreParameterServerStrategy(distribute_lib.StrategyV1):
"""Mock the strategy to allow cluster resolver as an argument."""

def __init__(self, cluster_resolver):
super(MockCoreParameterServerStrategy, self).__init__(
core_parameter_server_strategy.ParameterServerStrategyExtended(
self, cluster_resolver=cluster_resolver))


def create_test_objects(cluster_spec=None,
task_type=None,
task_id=None,
@@ -91,13 +81,15 @@ def create_test_objects(cluster_spec=None,
task_type=task_type,
task_id=task_id,
num_accelerators={'GPU': num_gpus})
distribution = core_parameter_server_strategy.ParameterServerStrategy(
cluster_resolver)
target = 'grpc://' + cluster_spec[WORKER][task_id]
else:
cluster_resolver = SimpleClusterResolver(
ClusterSpec({}), num_accelerators={'GPU': num_gpus})
distribution = (
central_storage_strategy.CentralStorageStrategy._from_num_gpus(
num_gpus))
target = ''

distribution = MockCoreParameterServerStrategy(cluster_resolver)
sess_config = copy.deepcopy(sess_config)
sess_config = distribution.update_config_proto(sess_config)
else:
@@ -440,7 +432,8 @@ class ParameterServerStrategyTestBase(
x, y, z, train_op = d.extended.call_for_each_replica(model_fn)
train_op = d.group(train_op)

if context.num_gpus() < d.extended._num_gpus_per_worker:
if context.num_gpus() < sum(
1 for d in d.extended.worker_devices if 'GPU' in d.upper()):
return True

if task_id == 0:
@@ -536,7 +529,8 @@ class ParameterServerStrategyTestBase(

before_out, after_out = step()

if context.num_gpus() < d.extended._num_gpus_per_worker:
if context.num_gpus() < sum(
1 for d in d.extended.worker_devices if 'GPU' in d.upper()):
return True

if (not task_type or
@@ -778,9 +772,11 @@ class ParameterServerStrategyTest(
combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
def testUpdateConfigProtoMultiWorker(self, use_core_strategy):
strategy, _, _ = create_test_objects(
num_gpus=2, use_core_strategy=use_core_strategy)
strategy.configure(
cluster_spec=self._cluster_spec, task_type='worker', task_id=1)
cluster_spec=self._cluster_spec,
task_type='worker',
task_id=1,
num_gpus=2,
use_core_strategy=use_core_strategy)

config_proto = config_pb2.ConfigProto(device_filters=['to_be_overridden'])

@@ -923,8 +919,8 @@ class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase,
strategy.extended.call_for_each_replica(f)


class LocalParameterServerStrategyTest(strategy_test_lib.DistributionTestBase,
parameterized.TestCase):
class CentralStorageStrategyTest(strategy_test_lib.DistributionTestBase,
parameterized.TestCase):

@combinations.generate(combinations.combine(mode=['graph', 'eager'],
use_core_strategy=[True, False],
@@ -276,6 +276,18 @@ py_library(
],
)

py_library(
name = "central_storage_strategy",
srcs = ["central_storage_strategy.py"],
visibility = ["//tensorflow:internal"],
deps = [
":device_util",
":distribute_lib",
":parameter_server_strategy",
"//tensorflow/python:util",
],
)

py_library(
name = "one_device_strategy",
srcs = ["one_device_strategy.py"],
@@ -531,11 +543,11 @@ py_library(
srcs = ["strategy_combinations.py"],
srcs_version = "PY2AND3",
deps = [
":central_storage_strategy",
":combinations",
":distribute_lib",
":mirrored_strategy",
":one_device_strategy",
":parameter_server_strategy",
":tpu_strategy",
"//tensorflow/python:framework_ops",
"//tensorflow/python:training",
tensorflow/python/distribute/central_storage_strategy.py (new file, 66 lines)
@@ -0,0 +1,66 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Classes implementing a multi-worker ps DistributionStrategy."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from tensorflow.python.distribute import device_util
from tensorflow.python.distribute import distribute_lib
from tensorflow.python.distribute import parameter_server_strategy
from tensorflow.python.util.tf_export import tf_export


@tf_export("distribute.experimental.CentralStorageStrategy", v1=[])
class CentralStorageStrategy(distribute_lib.Strategy):
  """A one-machine strategy that puts all variables on a single device.

  Variables are assigned to local CPU or the only GPU. If there is more
  than one GPU, compute operations (other than variable update operations)
  will be replicated across all GPUs.

  Args:
    compute_devices: an optional list of strings for device to replicate models
      on. If this is not provided, all local GPUs will be used; if there is no
      GPU, local CPU will be used.
    parameter_device: an optional device string for which device to put
      variables on. The default one is CPU or GPU if there is only one.
  """

  def __init__(self, compute_devices=None, parameter_device=None):
    extended = parameter_server_strategy.ParameterServerStrategyExtended(
        self,
        compute_devices=compute_devices,
        parameter_device=parameter_device)
    super(CentralStorageStrategy, self).__init__(extended)

  @classmethod
  def _from_num_gpus(cls, num_gpus):
    return cls(device_util.local_devices_from_num_gpus(num_gpus))


@tf_export(v1=["distribute.experimental.CentralStorageStrategy"])
class CentralStorageStrategyV1(distribute_lib.StrategyV1):

  __doc__ = CentralStorageStrategy.__doc__

  def __init__(self, compute_devices=None, parameter_device=None):
    """Initializes this strategy with default TFConfigClusterResolver."""
    super(CentralStorageStrategyV1, self).__init__(
        parameter_server_strategy.ParameterServerStrategyExtended(
            self,
            compute_devices=compute_devices,
            parameter_device=parameter_device))
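A hedged usage sketch for the new class (not part of the commit); the two-GPU device list and the variable are illustrative only:

# Sketch: variables go to the parameter device, compute is replicated.
import tensorflow as tf

strategy = tf.distribute.experimental.CentralStorageStrategy(
    compute_devices=["/device:GPU:0", "/device:GPU:1"],  # assumes a 2-GPU host
    parameter_device="/device:CPU:0")

with strategy.scope():
  v = tf.Variable(1.0)  # created on the parameter device, /device:CPU:0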
@@ -108,3 +108,9 @@ def get_host_for_device(device):
return tf_device.DeviceSpec(
job=spec.job, replica=spec.replica, task=spec.task,
device_type="CPU", device_index=0).to_string()


def local_devices_from_num_gpus(num_gpus):
"""Returns device strings for local GPUs or CPU."""
return (tuple("/device:GPU:%d" % i for i in range(num_gpus)) or
("/device:CPU:0",))
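As a quick check of the new helper's behavior (a sketch derived from its body above, not part of the diff):

from tensorflow.python.distribute import device_util

assert device_util.local_devices_from_num_gpus(2) == (
    "/device:GPU:0", "/device:GPU:1")
assert device_util.local_devices_from_num_gpus(0) == ("/device:CPU:0",)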
@@ -13,6 +13,7 @@ py_library(
],
srcs_version = "PY2AND3",
deps = [
"//tensorflow/python/distribute:central_storage_strategy",
"//tensorflow/python/distribute:collective_all_reduce_strategy",
"//tensorflow/python/distribute:parameter_server_strategy",
"//tensorflow/python/distribute:tpu_strategy",
@@ -19,6 +19,7 @@ from __future__ import division
from __future__ import print_function

# pylint: disable=unused-import
from tensorflow.python.distribute import central_storage_strategy
from tensorflow.python.distribute import collective_all_reduce_strategy
from tensorflow.python.distribute import parameter_server_strategy
from tensorflow.python.distribute import tpu_strategy
@@ -407,8 +407,7 @@ def _infer_num_gpus_per_worker(devices):
def all_local_devices(num_gpus=None):
if num_gpus is None:
num_gpus = context.num_gpus()
return (tuple("/device:GPU:%d" % i for i in range(num_gpus)) or
("/device:CPU:0",))
return device_util.local_devices_from_num_gpus(num_gpus)


def _all_devices():
@@ -43,32 +43,32 @@ from tensorflow.python.util import nest
from tensorflow.python.util.tf_export import tf_export

_LOCAL_CPU = "/device:CPU:0"
_LOCAL_GPU_0 = "/device:GPU:0"


# TODO(yuefengz): maybe cache variables on local CPU.
@tf_export("distribute.experimental.ParameterServerStrategy", v1=[])
class ParameterServerStrategy(distribute_lib.Strategy):
"""A parameter server DistributionStrategy.
"""An asynchronous multi-worker parameter server DistributionStrategy.

This strategy requires two jobs: workers and parameter servers. Variables and
updates to those variables will be assigned to parameter servers and other
operations are assigned to workers.

This strategy class works for both local training and between-graph replicated
training for multiple workers. It uses `TFConfigClusterResolver` to detect
configurations for multi-worker training. In multi-worker training mode, i.e.
`TFConfigClusterResolver` has detected 'TF_CONFIG' environment variable and
'TF_CONFIG' has a cluster spec, variables and updates to those variables are
assigned to parameter servers and other operations are assigned to workers.
In local training mode, variables are assigned to local CPU or the only GPU.
When each worker has more than one GPU, operations will be replicated on these
GPUs. In both cases, operations are replicated but variables are not and these
workers share a common view for which parameter server a variable is assigned
GPUs. Even though operations may be replicated, variables are not and each
worker shares a common view for which parameter server a variable is assigned
to.

This class assumes between-graph replication will be used and works on a graph
for a particular worker. Note that each graph and worker is independent.
This means that while each worker will synchronously compute a single gradient
update across all GPUs, updates between workers proceed asynchronously.
Operations that occur only on the first replica (such as incrementing the
global step), will occur on the first replica *of every worker*.
By default it uses `TFConfigClusterResolver` to detect configurations for
multi-worker training. This requires a 'TF_CONFIG' environment variable and
the 'TF_CONFIG' must have a cluster spec.

This class assumes each worker is running the same code independently, but
parameter servers are running a standard server. This means that while each
worker will synchronously compute a single gradient update across all GPUs,
updates between workers proceed asynchronously. Operations that occur only on
the first replica (such as incrementing the global step), will occur on the
first replica *of every worker*.

It is expected to call `call_for_each_replica(fn, ...)` for any
operations which potentially can be replicated across replicas (i.e. multiple
@@ -86,10 +86,21 @@ class ParameterServerStrategy(distribute_lib.Strategy):
possibly create conflicts of device assignment.
"""

def __init__(self):
"""Initializes this strategy with default TFConfigClusterResolver."""
super(ParameterServerStrategy, self).__init__(
ParameterServerStrategyExtended(self))
def __init__(self, cluster_resolver=None):
"""Initializes this strategy.

Args:
cluster_resolver: Optional
`tf.distribute.cluster_resolver.ClusterResolver` object. Defaults to a
`tf.distribute.cluster_resolver.TFConfigClusterResolver`.
"""
if cluster_resolver is None:
cluster_resolver = TFConfigClusterResolver()
if not cluster_resolver.cluster_spec():
raise ValueError("Cluster spec must be non-empty in `cluster_resolver`.")
extended = ParameterServerStrategyExtended(
self, cluster_resolver=cluster_resolver)
super(ParameterServerStrategy, self).__init__(extended)


@tf_export(v1=["distribute.experimental.ParameterServerStrategy"])
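A sketch of the new V2 constructor contract (not from the diff; the cluster addresses and resolver setup are illustrative): construction succeeds only when the resolver carries a non-empty cluster spec, otherwise the ValueError above is raised.

from tensorflow.python.distribute import parameter_server_strategy
from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
from tensorflow.python.training.server_lib import ClusterSpec

resolver = SimpleClusterResolver(
    ClusterSpec({"worker": ["host1:2222"], "ps": ["host2:2222"]}),  # placeholders
    task_type="worker", task_id=0, num_accelerators={"GPU": 0})
strategy = parameter_server_strategy.ParameterServerStrategy(
    cluster_resolver=resolver)

# With no resolver and no cluster in TF_CONFIG, __init__ raises:
#   ValueError: Cluster spec must be non-empty in `cluster_resolver`.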
@@ -97,31 +108,41 @@ class ParameterServerStrategyV1(distribute_lib.StrategyV1):

__doc__ = ParameterServerStrategy.__doc__

def __init__(self):
"""Initializes this strategy with default TFConfigClusterResolver."""
def __init__(self, cluster_resolver=None):
"""Initializes this strategy."""
super(ParameterServerStrategyV1, self).__init__(
ParameterServerStrategyExtended(self))
ParameterServerStrategyExtended(
self, cluster_resolver=cluster_resolver))


# TODO(josh11b): Switch to V2 when we no longer need to support tf.compat.v1.
class ParameterServerStrategyExtended(distribute_lib.StrategyExtendedV1):
"""Implementation of ParameterServerStrategy."""
"""Implementation of ParameterServerStrategy and CentralStorageStrategy."""

def __init__(self,
container_strategy,
cluster_resolver=TFConfigClusterResolver()):
cluster_resolver=None,
compute_devices=None,
parameter_device=None):
super(ParameterServerStrategyExtended, self).__init__(container_strategy)
self._initialize_strategy(cluster_resolver)
self._initialize_strategy(
cluster_resolver=cluster_resolver,
compute_devices=compute_devices,
parameter_device=parameter_device)

# We typically don't need to do all-reduce in this strategy.
self._cross_device_ops = (
cross_device_ops_lib.ReductionToOneDevice(reduce_to_device=_LOCAL_CPU))

def _initialize_strategy(self, cluster_resolver):
if cluster_resolver.cluster_spec().as_dict():
def _initialize_strategy(self,
cluster_resolver=None,
compute_devices=None,
parameter_device=None):
if cluster_resolver and cluster_resolver.cluster_spec():
self._initialize_multi_worker(cluster_resolver)
else:
self._initialize_local(cluster_resolver)
self._initialize_local(
compute_devices, parameter_device, cluster_resolver=cluster_resolver)

def _initialize_multi_worker(self, cluster_resolver):
"""Initialize devices for multiple workers.
@@ -214,43 +235,41 @@ class ParameterServerStrategyExtended(distribute_lib.StrategyExtendedV1):
num_ps_replicas, self._is_chief, self._device_map,
self._variable_device)

def _initialize_local(self, cluster_resolver):
# TODO(yuefengz): get rid of cluster_resolver argument when contrib's
# version no longer depends on this class.
def _initialize_local(self,
compute_devices,
parameter_device,
cluster_resolver=None):
"""Initialize internal devices for local training."""
worker_device = device_util.canonicalize("/device:CPU:0")
self._input_host_device = numpy_dataset.SingleDevice(worker_device)

# TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
# some cases.
if isinstance(cluster_resolver, TFConfigClusterResolver):
num_gpus = context.num_gpus()
else:
num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)
if compute_devices is None:
if not cluster_resolver:
num_gpus = context.num_gpus()
else:
num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)
# Save the num_gpus_per_worker for configure method which is used by the
# contrib version.
self._num_gpus_per_worker = num_gpus

# Save the num_gpus_per_worker for configure method.
self._num_gpus_per_worker = num_gpus
compute_devices = device_util.local_devices_from_num_gpus(num_gpus)

# Define compute devices which is a list of device strings and one for each
# replica. When there are GPUs, replicate operations on these GPUs.
# Otherwise, place operations on CPU.
if num_gpus > 0:
compute_devices = tuple(map("/device:GPU:{}".format, range(num_gpus)))
else:
compute_devices = (_LOCAL_CPU,)
if parameter_device is None:
# If there is only one GPU, put everything on that GPU. Otherwise, place
# variables on CPU.
if len(compute_devices) == 1:
parameter_device = compute_devices[0]
else:
parameter_device = _LOCAL_CPU

self._device_map = values.ReplicaDeviceMap(compute_devices)
self._input_workers = input_lib.InputWorkers(
self._device_map, [(worker_device, compute_devices)])

# If there is only one GPU, put everything on that GPU. Otherwise, place
# variables on CPU.
if num_gpus == 1:
assert len(compute_devices) == 1
self._variable_device = _LOCAL_GPU_0
self._parameter_devices = (_LOCAL_GPU_0,)
else:
self._variable_device = _LOCAL_CPU
self._parameter_devices = (_LOCAL_CPU,)

self._variable_device = parameter_device
self._parameter_devices = (parameter_device,)
self._is_chief = True
self._cluster_spec = None
self._task_type = None
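The local path's device placement rule, restated as a small standalone sketch (this mirrors the hunk above; it is not code from the commit):

def _local_placement(num_gpus):
  # Compute devices: one per local GPU, or the CPU if there are none.
  compute = tuple("/device:GPU:%d" % i for i in range(num_gpus)) or (
      "/device:CPU:0",)
  # A single compute device also holds the variables; otherwise they stay on CPU.
  parameter = compute[0] if len(compute) == 1 else "/device:CPU:0"
  return compute, parameter

assert _local_placement(1) == (("/device:GPU:0",), "/device:GPU:0")
assert _local_placement(4)[1] == "/device:CPU:0"
assert _local_placement(0) == (("/device:CPU:0",), "/device:CPU:0")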
@@ -17,6 +17,7 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.python.distribute import central_storage_strategy
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import distribution_strategy_context
from tensorflow.python.distribute import mirrored_strategy as mirrored_lib
@@ -100,6 +101,10 @@ mirrored_strategy_with_two_gpus = combinations.NamedDistribution(
"Mirrored2GPUs",
lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/gpu:1"]),
required_gpus=2)
central_storage_strategy_with_two_gpus = combinations.NamedDistribution(
"CentralStorage2GPUs",
lambda: central_storage_strategy.CentralStorageStrategy._from_num_gpus(2),  # pylint: disable=protected-access
required_gpus=2)

gradient_descent_optimizer_v1_fn = combinations.NamedObject(
"GradientDescentV1", lambda: gradient_descent.GradientDescentOptimizer(0.2))
@@ -23,7 +23,6 @@ from absl.testing import parameterized
from tensorflow.core.protobuf import config_pb2
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import device_util
from tensorflow.python.distribute import parameter_server_strategy
from tensorflow.python.distribute import strategy_combinations
from tensorflow.python.distribute import values
from tensorflow.python.eager import context
@@ -40,16 +39,6 @@ from tensorflow.python.training import saver as saver_lib
from tensorflow.python.training.tracking import util as trackable_utils


# TODO(rchao): Merge parameter_server_strategy_with_two_gpus into
# third_party/tensorflow/python/distribute/strategy_combinations.py
# pylint: disable=g-long-lambda
parameter_server_strategy_with_two_gpus = combinations.NamedDistribution(
"ParameterServer2GPUs",
lambda: parameter_server_strategy.ParameterServerStrategy(
num_gpus_per_worker=2),
required_gpus=2)


class DistributedValuesTest(test.TestCase):

def testGetEager(self):
@@ -561,7 +550,9 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase):

@combinations.generate(
combinations.combine(
distribution=[parameter_server_strategy_with_two_gpus],
distribution=[
strategy_combinations.central_storage_strategy_with_two_gpus
],
mode=["graph", "eager"]))
def testAssignOutOfScope_aggregating(self, distribution):
with distribution.scope():
@@ -577,7 +568,7 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase):
strategy_combinations.mirrored_strategy_with_one_cpu,
strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
strategy_combinations.tpu_strategy,
parameter_server_strategy_with_two_gpus,
strategy_combinations.central_storage_strategy_with_two_gpus,
],
mode=["graph", "eager"]))
def testExtendsVariable(self, distribution):
@@ -591,7 +582,7 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase):
strategy_combinations.mirrored_strategy_with_one_cpu,
strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
strategy_combinations.tpu_strategy,
parameter_server_strategy_with_two_gpus,
strategy_combinations.central_storage_strategy_with_two_gpus,
],
mode=["graph", "eager"]))
def testCheckpointing(self, distribution):

@@ -318,10 +318,12 @@ py_library(
"//tensorflow/python/distribute:combinations",
"//tensorflow/python/distribute:distribute_config",
"//tensorflow/python/distribute:distribute_coordinator",
"//tensorflow/python/distribute:distribute_lib",
"//tensorflow/python/distribute:mirrored_strategy",
"//tensorflow/python/distribute:multi_worker_test_base",
"//tensorflow/python/distribute:parameter_server_strategy",
"//tensorflow/python/distribute:strategy_combinations",
"//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib",
"//tensorflow/python/eager:context",
"//tensorflow/python/keras",
],
@@ -23,7 +23,7 @@ import numpy as np
from tensorflow.python import keras
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import distribution_strategy_context as ds_context
from tensorflow.python.distribute import parameter_server_strategy
from tensorflow.python.distribute import strategy_combinations
from tensorflow.python.eager import context
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
@@ -36,15 +36,6 @@ from tensorflow.python.ops import variables
from tensorflow.python.platform import test


# TODO(rchao): Merge parameter_server_strategy_with_two_gpus into
# third_party/tensorflow/python/distribute/strategy_combinations.py
# pylint: disable=g-long-lambda
parameter_server_strategy_with_two_gpus = combinations.NamedDistribution(
'ParameterServer2GPUs',
lambda: parameter_server_strategy.ParameterServerStrategy(),
required_gpus=2)


def get_model():
x = keras.layers.Input(shape=(3,), name='input')
y = keras.layers.Dense(4, name='dense')(x)
@@ -57,7 +48,7 @@ class MirroredStrategyOptimizerV2Test(test.TestCase, parameterized.TestCase):
@combinations.generate(
combinations.combine(
distribution=[
parameter_server_strategy_with_two_gpus,
strategy_combinations.central_storage_strategy_with_two_gpus,
],
mode=['graph', 'eager']))
def testKerasOptimizerWithUnequalInput(self, distribution):
@@ -114,7 +105,7 @@ class MirroredStrategyOptimizerV2Test(test.TestCase, parameterized.TestCase):
@combinations.generate(
combinations.combine(
distribution=[
parameter_server_strategy_with_two_gpus,
strategy_combinations.central_storage_strategy_with_two_gpus,
],
mode=['graph', 'eager']))
def testOptimizerWithKerasModelAndNumpyArrays(self, distribution):

@@ -34,9 +34,11 @@ from tensorflow.python.distribute import collective_all_reduce_strategy as colle
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import distribute_coordinator as dc
from tensorflow.python.distribute import distribute_coordinator_context as dc_context
from tensorflow.python.distribute import distribute_lib
from tensorflow.python.distribute import mirrored_strategy
from tensorflow.python.distribute import multi_worker_test_base as test_base
from tensorflow.python.distribute import parameter_server_strategy
from tensorflow.python.distribute.cluster_resolver import TFConfigClusterResolver
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.keras import backend
@@ -51,6 +53,23 @@ from tensorflow.python.platform import test
from tensorflow.python.util import nest


# TODO(b/130375202): remove this class which is a temporary solution before we
# get rid of configure method.
class ParameterServerStrategy(distribute_lib.Strategy):
"""Temporarily mock the original strategy to bypass cluster_spec check."""

def __init__(self, cluster_resolver=None):
"""Initializes this strategy."""
# The `cluster_resolver` must be set so that
# `ParameterServerStrategyExtended` will keep num_gpus for `configure`
# method.
if cluster_resolver is None:
cluster_resolver = TFConfigClusterResolver()
extended = parameter_server_strategy.ParameterServerStrategyExtended(
self, cluster_resolver=cluster_resolver)
super(ParameterServerStrategy, self).__init__(extended)


def _mnist_synthetic_dataset(batch_size, steps_per_epoch):
# train dataset
x_train = array_ops.ones([batch_size * steps_per_epoch, 28, 28, 1],
@@ -301,7 +320,7 @@ class KerasMultiWorkerTestStandaloneClient(test.TestCase,
mode=['graph'],
strategy_cls=[
mirrored_strategy.MirroredStrategy,
parameter_server_strategy.ParameterServerStrategy,
ParameterServerStrategy,
collective_strategy.CollectiveAllReduceStrategy,
],
required_gpus=[0, 1]))
@@ -383,7 +402,7 @@ class KerasMultiWorkerTestIndependentWorker(test_base.IndependentWorkerTestBase,
@combinations.generate(
combinations.combine(
mode=['graph'],
strategy_cls=[parameter_server_strategy.ParameterServerStrategy],
strategy_cls=[ParameterServerStrategy],
required_gpus=[0, 1]))
def testSimpleModelIndependentWorkerAsync(self, strategy_cls):
num_workers = 2
@@ -0,0 +1,67 @@
path: "tensorflow.distribute.experimental.CentralStorageStrategy"
tf_class {
is_instance: "<class \'tensorflow.python.distribute.central_storage_strategy.CentralStorageStrategyV1\'>"
is_instance: "<class \'tensorflow.python.distribute.distribute_lib.StrategyV1\'>"
is_instance: "<class \'tensorflow.python.distribute.distribute_lib.Strategy\'>"
is_instance: "<type \'object\'>"
member {
name: "extended"
mtype: "<type \'property\'>"
}
member {
name: "num_replicas_in_sync"
mtype: "<type \'property\'>"
}
member_method {
name: "__init__"
argspec: "args=[\'self\', \'compute_devices\', \'parameter_device\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
}
member_method {
name: "colocate_vars_with"
argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "configure"
argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
}
member_method {
name: "experimental_local_results"
argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "experimental_run"
argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
member_method {
name: "experimental_run_v2"
argspec: "args=[\'self\', \'fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
}
member_method {
name: "group"
argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
member_method {
name: "make_dataset_iterator"
argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "make_input_fn_iterator"
argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
}
member_method {
name: "reduce"
argspec: "args=[\'self\', \'reduce_op\', \'value\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
member_method {
name: "scope"
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "unwrap"
argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "update_config_proto"
argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
}
}
@@ -14,7 +14,7 @@ tf_class {
}
member_method {
name: "__init__"
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
argspec: "args=[\'self\', \'cluster_resolver\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
member_method {
name: "colocate_vars_with"
@@ -1,5 +1,9 @@
path: "tensorflow.distribute.experimental"
tf_module {
member {
name: "CentralStorageStrategy"
mtype: "<type \'type\'>"
}
member {
name: "CollectiveCommunication"
mtype: "<class \'enum.EnumMeta\'>"
@@ -0,0 +1,66 @@
path: "tensorflow.distribute.experimental.CentralStorageStrategy"
tf_class {
is_instance: "<class \'tensorflow.python.distribute.central_storage_strategy.CentralStorageStrategy\'>"
is_instance: "<class \'tensorflow.python.distribute.distribute_lib.Strategy\'>"
is_instance: "<type \'object\'>"
member {
name: "extended"
mtype: "<type \'property\'>"
}
member {
name: "num_replicas_in_sync"
mtype: "<type \'property\'>"
}
member_method {
name: "__init__"
argspec: "args=[\'self\', \'compute_devices\', \'parameter_device\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
}
member_method {
name: "colocate_vars_with"
argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "configure"
argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
}
member_method {
name: "experimental_local_results"
argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "experimental_run"
argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
member_method {
name: "experimental_run_v2"
argspec: "args=[\'self\', \'fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
}
member_method {
name: "group"
argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
member_method {
name: "make_dataset_iterator"
argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "make_input_fn_iterator"
argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
}
member_method {
name: "reduce"
argspec: "args=[\'self\', \'reduce_op\', \'value\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
member_method {
name: "scope"
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "unwrap"
argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "update_config_proto"
argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
}
}
@@ -13,7 +13,7 @@ tf_class {
}
member_method {
name: "__init__"
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
argspec: "args=[\'self\', \'cluster_resolver\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
member_method {
name: "colocate_vars_with"
@@ -1,5 +1,9 @@
path: "tensorflow.distribute.experimental"
tf_module {
member {
name: "CentralStorageStrategy"
mtype: "<type \'type\'>"
}
member {
name: "CollectiveCommunication"
mtype: "<class \'enum.EnumMeta\'>"