Update API docs of ClusterResolver and all its implementations.

PiperOrigin-RevId: 259246199
This commit is contained in:
Anjali Sridhar 2019-07-21 19:35:10 -07:00 committed by TensorFlower Gardener
parent 4f73ebfcff
commit 96d0f42d1b
7 changed files with 29 additions and 17 deletions

View File

@ -12,7 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Library Imports for Cluster Resolvers."""
"""Library imports for ClusterResolvers.
This library contains all implementations of ClusterResolvers.
ClusterResolvers are a way of specifying cluster information for distributed
execution. Built on top of existing `ClusterSpec` framework, ClusterResolvers
are a way for TensorFlow to communicate with various cluster management
systems (e.g. GCE, AWS).
"""
from __future__ import absolute_import
from __future__ import division

View File

@ -90,7 +90,7 @@ class ClusterResolver(object):
@abc.abstractmethod
def cluster_spec(self):
"""Retrieve the current state of the cluster and returns a ClusterSpec.
"""Retrieve the current state of the cluster and return a ClusterSpec.
Returns:
A ClusterSpec representing the state of the cluster at the moment this
@ -288,7 +288,7 @@ class UnionClusterResolver(ClusterResolver):
when cluster_spec is called. The details of the merge function are
documented in the cluster_spec function.
For additional Cluster Resolver properties such as task type, task index,
For additional ClusterResolver properties such as task type, task index,
rpc layer, environment, etc..., we will return the value from the first
ClusterResolver in the union.
"""

View File

@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Implementation of Cluster Resolvers for GCE Instance Groups."""
"""Implementation of ClusterResolvers for GCE instance groups."""
from __future__ import absolute_import
from __future__ import division
@ -33,12 +33,12 @@ except ImportError:
@tf_export('distribute.cluster_resolver.GCEClusterResolver')
class GCEClusterResolver(ClusterResolver):
"""Cluster Resolver for Google Compute Engine.
"""ClusterResolver for Google Compute Engine.
This is an implementation of cluster resolvers for the Google Compute Engine
instance group platform. By specifying a project, zone, and instance group,
this will retrieve the IP address of all the instances within the instance
group and return a Cluster Resolver object suitable for use for distributed
group and return a ClusterResolver object suitable for use for distributed
TensorFlow.
"""

View File

@ -33,7 +33,7 @@ except ImportError:
@tf_export('distribute.cluster_resolver.KubernetesClusterResolver')
class KubernetesClusterResolver(ClusterResolver):
"""Cluster Resolver for Kubernetes.
"""ClusterResolver for Kubernetes.
This is an implementation of cluster resolvers for Kubernetes. When given the
Kubernetes namespace and label selector for pods, we will retrieve the
@ -48,7 +48,7 @@ class KubernetesClusterResolver(ClusterResolver):
override_client=None):
"""Initializes a new KubernetesClusterResolver.
This initializes a new Kubernetes Cluster Resolver. The Cluster Resolver
This initializes a new Kubernetes ClusterResolver. The ClusterResolver
will attempt to talk to the Kubernetes master to retrieve all the instances
of pods matching a label selector.

View File

@ -30,13 +30,13 @@ from tensorflow.python.util.tf_export import tf_export
@tf_export('distribute.cluster_resolver.SlurmClusterResolver')
class SlurmClusterResolver(ClusterResolver):
"""Cluster Resolver for system with Slurm workload manager.
"""ClusterResolver for system with Slurm workload manager.
This is an implementation of cluster resolvers for Slurm clusters. This allows
the specification of jobs and task counts, number of tasks per node, number of
GPUs on each node and number of GPUs for each task, It retrieves system
GPUs on each node and number of GPUs for each task. It retrieves system
attributes by Slurm environment variables, resolves allocated computing node
names, construct a cluster and return a Cluster Resolver object which an be
names, constructs a cluster and returns a ClusterResolver object which can be
used for distributed TensorFlow.
"""
@ -61,15 +61,15 @@ class SlurmClusterResolver(ClusterResolver):
"""Creates a new SlurmClusterResolver object.
This takes in parameters and creates a SlurmClusterResolver object. It uses
those parameters to check which nodes will processes reside and resolves
those parameters to check which nodes processes will reside on and resolves
their hostnames. With the number of the GPUs on each node and number of GPUs
for each task it offsets the port number for each processes and allocate
for each task it offsets the port number for each process and allocates
GPUs to tasks by setting environment variables. The resolver currently
supports homogeneous tasks and default Slurm process allocation.
Args:
jobs: Dictionary with job names as key and number of tasks in the job as
value
value.
port_base: The first port number to start with for processes on a node.
gpus_per_node: Number of GPUs available on each node.
gpus_per_task: Number of GPUs to be used for each task.

View File

@ -50,7 +50,12 @@ def _get_value_in_tfconfig(key, default=None):
@tf_export('distribute.cluster_resolver.TFConfigClusterResolver')
class TFConfigClusterResolver(ClusterResolver):
"""Implementation of a ClusterResolver which reads the TF_CONFIG EnvVar."""
"""Implementation of a ClusterResolver which reads the TF_CONFIG EnvVar.
This is an implementation of cluster resolvers when using TF_CONFIG to set
information about the cluster. The cluster spec returned will be
initialized from the TF_CONFIG environment variable.
"""
def __init__(self,
task_type=None,

View File

@ -94,7 +94,7 @@ class TPUClusterResolver(ClusterResolver):
This works around an issue where the underlying HTTP connection sometimes
times out when the script has been running for too long. Other methods in
this object calls this method to get a new API object whenever they need
this object call this method to get a new API object whenever they need
to communicate with the Cloud API.
Returns:
@ -206,7 +206,7 @@ class TPUClusterResolver(ClusterResolver):
for the IP addresses and ports of each Cloud TPU listed.
Args:
tpu: A string corresponding to the TPU to use. If the string is the empty
tpu: A string corresponding to the TPU to use. If the string is an empty
string, the string 'local', or a string that begins with 'grpc://' or
'/bns', then it is assumed to not correspond with a Cloud TPU and will
instead be passed as the session master and no ClusterSpec propagation