Docstring fixes for cluster resolvers.
PiperOrigin-RevId: 315426402
Change-Id: I9a8982af6a2fe0538f9af3812572db55b29525b8
@@ -63,7 +63,8 @@ class ClusterResolver(object):
 
   This defines the skeleton for all implementations of ClusterResolvers.
   ClusterResolvers are a way for TensorFlow to communicate with various cluster
-  management systems (e.g. GCE, AWS, etc...).
+  management systems (e.g. GCE, AWS, etc...) and gives TensorFlow necessary
+  information to set up distributed training.
 
   By letting TensorFlow communicate with these systems, we will be able to
   automatically discover and resolve IP addresses for various TensorFlow
@@ -73,7 +74,8 @@ class ClusterResolver(object):
   Note to Implementors: In addition to these abstract methods, you must also
   implement the task_type, task_id, and rpc_layer attributes. You may choose
   to implement them either as properties with getters or setters or directly
-  set the attributes.
+  set the attributes. The task_type and task_id attributes are required by
+  `tf.distribute.experimental.MultiWorkerMirroredStrategy`.
 
   - task_type is the name of the server's current named job (e.g. 'worker',
     'ps' in a distributed parameterized training job).
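The implementor contract described in this hunk can be made concrete with a short sketch. This is an editor's illustration, not code from this commit: the class name `StaticClusterResolver` and its constructor are hypothetical, and only the two abstract methods are overridden, with the three documented attributes set directly.

```Python
import tensorflow as tf

# Hypothetical example class; not part of TensorFlow.
class StaticClusterResolver(tf.distribute.cluster_resolver.ClusterResolver):

  def __init__(self, cluster_spec, task_type=None, task_id=None,
               rpc_layer='grpc'):
    self._cluster_spec = cluster_spec
    # The docstring allows these to be plain attributes instead of
    # properties; task_type and task_id are what
    # `tf.distribute.experimental.MultiWorkerMirroredStrategy` reads.
    self.task_type = task_type
    self.task_id = task_id
    self.rpc_layer = rpc_layer

  def cluster_spec(self):
    # Must reflect the current state of the cluster on every call.
    return self._cluster_spec

  def master(self, task_type=None, task_id=None, rpc_layer=None):
    task_type = task_type if task_type is not None else self.task_type
    task_id = task_id if task_id is not None else self.task_id
    if task_type is None or task_id is None:
      return ''
    address = self.cluster_spec().task_address(task_type, task_id)
    rpc_layer = rpc_layer or self.rpc_layer
    return '%s://%s' % (rpc_layer, address) if rpc_layer else address
```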
@@ -84,11 +86,11 @@ class ClusterResolver(object):
 
   @abc.abstractmethod
   def cluster_spec(self):
-    """Retrieve the current state of the cluster and return a ClusterSpec.
+    """Retrieve the current state of the cluster and return a `tf.train.ClusterSpec`.
 
     Returns:
-      A ClusterSpec representing the state of the cluster at the moment this
-      function is called.
+      A `tf.train.ClusterSpec` representing the state of the cluster at the
+      moment this function is called.
 
     Implementors of this function must take care in ensuring that the
     ClusterSpec returned is up-to-date at the time of calling this function.
@@ -102,6 +104,8 @@ class ClusterResolver(object):
   def master(self, task_type=None, task_id=None, rpc_layer=None):
     """Retrieves the name or URL of the session master.
 
+    Note: this is only useful for TensorFlow 1.x.
+
     Args:
       task_type: (Optional) The type of the TensorFlow task of the master.
       task_id: (Optional) The index of the TensorFlow task of the master.
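As a hedged illustration of the return format (using `SimpleClusterResolver`, whose expanded docstring appears in a later hunk; the address is simply whatever the cluster spec holds):

```Python
import tensorflow as tf

cluster = tf.train.ClusterSpec({"worker": ["worker0.example.com:2222"]})
resolver = tf.distribute.cluster_resolver.SimpleClusterResolver(
    cluster, task_type="worker", task_id=0, rpc_layer="grpc")

# The rpc_layer is prepended as a URL scheme when one is configured.
print(resolver.master(task_type="worker", task_id=0))
# grpc://worker0.example.com:2222
```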
@@ -126,7 +130,7 @@ class ClusterResolver(object):
     available per worker.
 
     Optionally, we allow callers to specify the task_type, and task_id, for
-    if they want to target a specific TensorFlow process to query
+    if they want to target a specific TensorFlow task to query
     the number of accelerators. This is to support heterogeneous environments,
     where the number of accelerator cores per host is different.
 
@@ -142,6 +146,8 @@ class ClusterResolver(object):
       A map of accelerator types to number of cores.
     """
     master = self.master(task_type, task_id)
+    # TODO(b/126786766): in eager mode, we should check whether
+    # `tf.config.experimental_connect_to_cluster` is called or not.
     devices = get_accelerator_devices(master, config_proto)
     mapping = collections.defaultdict(int)
     for device in devices:
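For resolvers that do not probe devices at all, such as `SimpleClusterResolver`, the mapping is simply the constructor argument echoed back; a small sketch:

```Python
import tensorflow as tf

cluster = tf.train.ClusterSpec({"worker": ["worker0.example.com:2222"]})
resolver = tf.distribute.cluster_resolver.SimpleClusterResolver(
    cluster, task_type="worker", task_id=0, num_accelerators={"GPU": 8})

# No session is created and no detection runs; the dict passed to the
# constructor is returned as-is.
print(resolver.num_accelerators())  # {'GPU': 8}
```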
@@ -174,7 +180,35 @@ class ClusterResolver(object):
 
 @tf_export('distribute.cluster_resolver.SimpleClusterResolver')
 class SimpleClusterResolver(ClusterResolver):
-  """Simple implementation of ClusterResolver that accepts a ClusterSpec."""
+  """Simple implementation of ClusterResolver that accepts all attributes.
+
+  Please see the base class for documentation of arguments of its constructor.
+
+  It is useful if you want to specify some or all attributes.
+
+  Usage example with `tf.distribute.Strategy`:
+
+  ```Python
+  cluster = tf.train.ClusterSpec({"worker": ["worker0.example.com:2222",
+                                             "worker1.example.com:2222"]})
+
+  # On worker 0
+  cluster_resolver = SimpleClusterResolver(cluster, task_type="worker",
+                                           task_id=0,
+                                           num_accelerators={"GPU": 8},
+                                           rpc_layer="grpc")
+  strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
+      cluster_resolver=cluster_resolver)
+
+  # On worker 1
+  cluster_resolver = SimpleClusterResolver(cluster, task_type="worker",
+                                           task_id=1,
+                                           num_accelerators={"GPU": 8},
+                                           rpc_layer="grpc")
+  strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
+      cluster_resolver=cluster_resolver)
+  ```
+  """
 
   def __init__(self, cluster_spec, master='', task_type=None, task_id=None,
                environment='', num_accelerators=None,
@@ -190,7 +224,7 @@ class SimpleClusterResolver(ClusterResolver):
     self._rpc_layer = rpc_layer
 
     if not isinstance(cluster_spec, ClusterSpec):
-      raise TypeError('cluster_spec must be a ClusterSpec.')
+      raise TypeError('cluster_spec must be a `tf.train.ClusterSpec`.')
     self._cluster_spec = cluster_spec
 
     if not isinstance(master, str):
@@ -204,6 +238,8 @@ class SimpleClusterResolver(ClusterResolver):
   def master(self, task_type=None, task_id=None, rpc_layer=None):
     """Returns the master address to use when creating a session.
 
+    Note: this is only useful for TensorFlow 1.x.
+
     Args:
       task_type: (Optional) The type of the TensorFlow task of the master.
       task_id: (Optional) The index of the TensorFlow task of the master.
@@ -249,9 +285,8 @@ class SimpleClusterResolver(ClusterResolver):
     """Returns the number of accelerator cores per worker.
 
     The SimpleClusterResolver does not do automatic detection of accelerators,
-    so a TensorFlow session will never be created, and thus all arguments are
-    unused and we simply assume that the type of accelerator is a GPU and return
-    the value in provided to us in the constructor.
+    and thus all arguments are unused and we simply return the value provided
+    in the constructor.
 
     Args:
       task_type: Unused.
@@ -285,6 +320,36 @@ class UnionClusterResolver(ClusterResolver):
   For additional ClusterResolver properties such as task type, task index,
   rpc layer, environment, etc..., we will return the value from the first
   ClusterResolver in the union.
+
+  An example to combine two cluster resolvers:
+
+  ```Python
+  cluster_0 = tf.train.ClusterSpec({"worker": ["worker0.example.com:2222",
+                                               "worker1.example.com:2222"]})
+  cluster_resolver_0 = SimpleClusterResolver(cluster_0, task_type="worker",
+                                             task_id=0,
+                                             rpc_layer="grpc")
+
+  cluster_1 = tf.train.ClusterSpec({"ps": ["ps0.example.com:2222",
+                                           "ps1.example.com:2222"]})
+  cluster_resolver_1 = SimpleClusterResolver(cluster_1, task_type="ps",
+                                             task_id=0,
+                                             rpc_layer="grpc")
+
+  # Its task type would be "worker".
+  cluster_resolver = UnionClusterResolver(cluster_resolver_0,
+                                          cluster_resolver_1)
+  ```
+
+  An example to override the number of GPUs in a TFConfigClusterResolver
+  instance:
+
+  ```Python
+  tf_config = TFConfigClusterResolver()
+  gpu_override = SimpleClusterResolver(tf_config.cluster_spec(),
+                                       num_accelerators={"GPU": 1})
+  cluster_resolver = UnionClusterResolver(gpu_override, tf_config)
+  ```
   """
 
   def __init__(self, *args, **kwargs):
@@ -400,6 +465,8 @@ class UnionClusterResolver(ClusterResolver):
     This usually returns the master from the first ClusterResolver passed in,
     but you can override this by specifying the task_type and task_id.
 
+    Note: this is only useful for TensorFlow 1.x.
+
     Args:
       task_type: (Optional) The type of the TensorFlow task of the master.
       task_id: (Optional) The index of the TensorFlow task of the master.
@@ -40,6 +40,29 @@ class GCEClusterResolver(ClusterResolver):
   this will retrieve the IP address of all the instances within the instance
   group and return a ClusterResolver object suitable for use for distributed
   TensorFlow.
+
+  Note: this cluster resolver cannot retrieve `task_type`, `task_id` or
+  `rpc_layer`. To use it with some distribution strategies like
+  `tf.distribute.experimental.MultiWorkerMirroredStrategy`, you will need to
+  specify `task_type` and `task_id` in the constructor.
+
+  Usage example with tf.distribute.Strategy:
+
+  ```Python
+  # On worker 0
+  cluster_resolver = GCEClusterResolver("my-project", "us-west1",
+                                        "my-instance-group",
+                                        task_type="worker", task_id=0)
+  strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
+      cluster_resolver=cluster_resolver)
+
+  # On worker 1
+  cluster_resolver = GCEClusterResolver("my-project", "us-west1",
+                                        "my-instance-group",
+                                        task_type="worker", task_id=1)
+  strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
+      cluster_resolver=cluster_resolver)
+  ```
   """
 
   def __init__(self,
@@ -39,6 +39,31 @@ class KubernetesClusterResolver(ClusterResolver):
   the Kubernetes namespace and label selector for pods, we will retrieve the
   pod IP addresses of all running pods matching the selector, and return a
   ClusterSpec based on that information.
+
+  Note: it cannot retrieve `task_type`, `task_id` or `rpc_layer`. To use it
+  with some distribution strategies like
+  `tf.distribute.experimental.MultiWorkerMirroredStrategy`, you will need to
+  specify `task_type` and `task_id` by setting these attributes.
+
+  Usage example with tf.distribute.Strategy:
+
+  ```Python
+  # On worker 0
+  cluster_resolver = KubernetesClusterResolver(
+      {"worker": ["job-name=worker-cluster-a", "job-name=worker-cluster-b"]})
+  cluster_resolver.task_type = "worker"
+  cluster_resolver.task_id = 0
+  strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
+      cluster_resolver=cluster_resolver)
+
+  # On worker 1
+  cluster_resolver = KubernetesClusterResolver(
+      {"worker": ["job-name=worker-cluster-a", "job-name=worker-cluster-b"]})
+  cluster_resolver.task_type = "worker"
+  cluster_resolver.task_id = 1
+  strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
+      cluster_resolver=cluster_resolver)
+  ```
   """
 
   def __init__(self,
@@ -101,6 +126,8 @@ class KubernetesClusterResolver(ClusterResolver):
     parameters when using this function. If you do both, the function parameters
     will override the object properties.
 
+    Note: this is only useful for TensorFlow 1.x.
+
     Args:
       task_type: (Optional) The type of the TensorFlow task of the master.
      task_id: (Optional) The index of the TensorFlow task of the master.
@@ -55,6 +55,30 @@ class TFConfigClusterResolver(ClusterResolver):
   This is an implementation of cluster resolvers when using TF_CONFIG to set
   information about the cluster. The cluster spec returned will be
   initialized from the TF_CONFIG environment variable.
+
+  An example to set TF_CONFIG is:
+
+  ```Python
+  os.environ['TF_CONFIG'] = json.dumps({
+    'cluster': {
+        'worker': ["localhost:12345", "localhost:23456"]
+    },
+    'task': {'type': 'worker', 'index': 0}
+  })
+  ```
+
+  However, sometimes the container orchestration framework will set TF_CONFIG
+  for you. In this case, you can just create an instance without passing in any
+  arguments. You can find an example here to let Kubernetes set TF_CONFIG for
+  you: https://github.com/tensorflow/ecosystem/tree/master/kubernetes. Then you
+  can use it with `tf.distribute.Strategy` as:
+
+  ```Python
+  # `TFConfigClusterResolver` is already the default one in the following
+  # strategy.
+  strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
+      cluster_resolver=TFConfigClusterResolver())
+  ```
   """
 
   def __init__(self,
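To show what the resolver parses out of the TF_CONFIG example above (a sketch; the values follow directly from that JSON):

```Python
import json
import os
import tensorflow as tf

os.environ['TF_CONFIG'] = json.dumps({
    'cluster': {
        'worker': ["localhost:12345", "localhost:23456"]
    },
    'task': {'type': 'worker', 'index': 0}
})

resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()
print(resolver.task_type)  # worker
print(resolver.task_id)    # 0
print(resolver.cluster_spec().as_dict())
# {'worker': ['localhost:12345', 'localhost:23456']}
```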
@@ -140,6 +164,8 @@ class TFConfigClusterResolver(ClusterResolver):
   def master(self, task_type=None, task_id=None, rpc_layer=None):
     """Returns the master address to use when creating a TensorFlow session.
 
+    Note: this is only useful for TensorFlow 1.x.
+
     Args:
       task_type: (String, optional) Overrides and sets the task_type of the
         master.