Update API docs of ClusterResolver and all its implementations.
PiperOrigin-RevId: 259246199
This commit is contained in:
parent
4f73ebfcff
commit
96d0f42d1b
@ -12,7 +12,14 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""Library Imports for Cluster Resolvers."""
|
||||
"""Library imports for ClusterResolvers.
|
||||
|
||||
This library contains all implementations of ClusterResolvers.
|
||||
ClusterResolvers are a way of specifying cluster information for distributed
|
||||
execution. Built on top of existing `ClusterSpec` framework, ClusterResolvers
|
||||
are a way for TensorFlow to communicate with various cluster management
|
||||
systems (e.g. GCE, AWS, etc.).
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
|
@ -90,7 +90,7 @@ class ClusterResolver(object):
|
||||
|
||||
@abc.abstractmethod
|
||||
def cluster_spec(self):
|
||||
"""Retrieve the current state of the cluster and returns a ClusterSpec.
|
||||
"""Retrieve the current state of the cluster and return a ClusterSpec.
|
||||
|
||||
Returns:
|
||||
A ClusterSpec representing the state of the cluster at the moment this
|
||||
@ -288,7 +288,7 @@ class UnionClusterResolver(ClusterResolver):
|
||||
when cluster_spec is called. The details of the merge function are
|
||||
documented in the cluster_spec function.
|
||||
|
||||
For additional Cluster Resolver properties such as task type, task index,
|
||||
For additional ClusterResolver properties such as task type, task index,
|
||||
rpc layer, environment, etc., we will return the value from the first
|
||||
ClusterResolver in the union.
|
||||
"""
|
||||
|
@ -12,7 +12,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""Implementation of Cluster Resolvers for GCE Instance Groups."""
|
||||
"""Implementation of ClusterResolvers for GCE instance groups."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
@ -33,12 +33,12 @@ except ImportError:
|
||||
|
||||
@tf_export('distribute.cluster_resolver.GCEClusterResolver')
|
||||
class GCEClusterResolver(ClusterResolver):
|
||||
"""Cluster Resolver for Google Compute Engine.
|
||||
"""ClusterResolver for Google Compute Engine.
|
||||
|
||||
This is an implementation of cluster resolvers for the Google Compute Engine
|
||||
instance group platform. By specifying a project, zone, and instance group,
|
||||
this will retrieve the IP address of all the instances within the instance
|
||||
group and return a Cluster Resolver object suitable for use for distributed
|
||||
group and return a ClusterResolver object suitable for use with distributed
|
||||
TensorFlow.
|
||||
"""
|
||||
|
||||
|
@ -33,7 +33,7 @@ except ImportError:
|
||||
|
||||
@tf_export('distribute.cluster_resolver.KubernetesClusterResolver')
|
||||
class KubernetesClusterResolver(ClusterResolver):
|
||||
"""Cluster Resolver for Kubernetes.
|
||||
"""ClusterResolver for Kubernetes.
|
||||
|
||||
This is an implementation of cluster resolvers for Kubernetes. When given the
|
||||
Kubernetes namespace and label selector for pods, we will retrieve the
|
||||
@ -48,7 +48,7 @@ class KubernetesClusterResolver(ClusterResolver):
|
||||
override_client=None):
|
||||
"""Initializes a new KubernetesClusterResolver.
|
||||
|
||||
This initializes a new Kubernetes Cluster Resolver. The Cluster Resolver
|
||||
This initializes a new Kubernetes ClusterResolver. The ClusterResolver
|
||||
will attempt to talk to the Kubernetes master to retrieve all the instances
|
||||
of pods matching a label selector.
|
||||
|
||||
|
@ -30,13 +30,13 @@ from tensorflow.python.util.tf_export import tf_export
|
||||
|
||||
@tf_export('distribute.cluster_resolver.SlurmClusterResolver')
|
||||
class SlurmClusterResolver(ClusterResolver):
|
||||
"""Cluster Resolver for system with Slurm workload manager.
|
||||
"""ClusterResolver for systems with the Slurm workload manager.
|
||||
|
||||
This is an implementation of cluster resolvers for Slurm clusters. This allows
|
||||
the specification of jobs and task counts, number of tasks per node, number of
|
||||
GPUs on each node and number of GPUs for each task, It retrieves system
|
||||
GPUs on each node and number of GPUs for each task. It retrieves system
|
||||
attributes by Slurm environment variables, resolves allocated computing node
|
||||
names, construct a cluster and return a Cluster Resolver object which an be
|
||||
names, constructs a cluster and returns a ClusterResolver object which can be
|
||||
used for distributed TensorFlow.
|
||||
"""
|
||||
|
||||
@ -61,15 +61,15 @@ class SlurmClusterResolver(ClusterResolver):
|
||||
"""Creates a new SlurmClusterResolver object.
|
||||
|
||||
This takes in parameters and creates a SlurmClusterResolver object. It uses
|
||||
those parameters to check which nodes will processes reside and resolves
|
||||
those parameters to check which nodes processes will reside on and resolves
|
||||
their hostnames. With the number of the GPUs on each node and number of GPUs
|
||||
for each task it offsets the port number for each processes and allocate
|
||||
for each task it offsets the port number for each process and allocates
|
||||
GPUs to tasks by setting environment variables. The resolver currently
|
||||
supports homogeneous tasks and default Slurm process allocation.
|
||||
|
||||
Args:
|
||||
jobs: Dictionary with job names as key and number of tasks in the job as
|
||||
value
|
||||
value.
|
||||
port_base: The first port number to start with for processes on a node.
|
||||
gpus_per_node: Number of GPUs available on each node.
|
||||
gpus_per_task: Number of GPUs to be used for each task.
|
||||
|
@ -50,7 +50,12 @@ def _get_value_in_tfconfig(key, default=None):
|
||||
|
||||
@tf_export('distribute.cluster_resolver.TFConfigClusterResolver')
|
||||
class TFConfigClusterResolver(ClusterResolver):
|
||||
"""Implementation of a ClusterResolver which reads the TF_CONFIG EnvVar."""
|
||||
"""Implementation of a ClusterResolver which reads the TF_CONFIG EnvVar.
|
||||
|
||||
This is an implementation of cluster resolvers when using TF_CONFIG to set
|
||||
information about the cluster. The cluster spec returned will be
|
||||
initialized from the TF_CONFIG environment variable.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
task_type=None,
|
||||
|
@ -94,7 +94,7 @@ class TPUClusterResolver(ClusterResolver):
|
||||
|
||||
This works around an issue where the underlying HTTP connection sometimes
|
||||
times out when the script has been running for too long. Other methods in
|
||||
this object calls this method to get a new API object whenever they need
|
||||
this object call this method to get a new API object whenever they need
|
||||
to communicate with the Cloud API.
|
||||
|
||||
Returns:
|
||||
@ -206,7 +206,7 @@ class TPUClusterResolver(ClusterResolver):
|
||||
for the IP addresses and ports of each Cloud TPU listed.
|
||||
|
||||
Args:
|
||||
tpu: A string corresponding to the TPU to use. If the string is the empty
|
||||
tpu: A string corresponding to the TPU to use. If the string is an empty
|
||||
string, the string 'local', or a string that begins with 'grpc://' or
|
||||
'/bns', then it is assumed to not correspond with a Cloud TPU and will
|
||||
instead be passed as the session master and no ClusterSpec propagation
|
||||
|
Loading…
Reference in New Issue
Block a user