From 96d0f42d1b236d21157d32805d4aa87e136083b3 Mon Sep 17 00:00:00 2001 From: Anjali Sridhar Date: Sun, 21 Jul 2019 19:35:10 -0700 Subject: [PATCH] Update API docs of ClusterResolver and all its implementations. PiperOrigin-RevId: 259246199 --- .../python/distribute/cluster_resolver/__init__.py | 9 ++++++++- .../distribute/cluster_resolver/cluster_resolver.py | 4 ++-- .../cluster_resolver/gce_cluster_resolver.py | 6 +++--- .../cluster_resolver/kubernetes_cluster_resolver.py | 4 ++-- .../cluster_resolver/slurm_cluster_resolver.py | 12 ++++++------ .../cluster_resolver/tfconfig_cluster_resolver.py | 7 ++++++- .../cluster_resolver/tpu_cluster_resolver.py | 4 ++-- 7 files changed, 29 insertions(+), 17 deletions(-) diff --git a/tensorflow/python/distribute/cluster_resolver/__init__.py b/tensorflow/python/distribute/cluster_resolver/__init__.py index 39ea191fb04..11de551b084 100644 --- a/tensorflow/python/distribute/cluster_resolver/__init__.py +++ b/tensorflow/python/distribute/cluster_resolver/__init__.py @@ -12,7 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Library Imports for Cluster Resolvers.""" +"""Library imports for ClusterResolvers. + + This library contains all implementations of ClusterResolvers. + ClusterResolvers are a way of specifying cluster information for distributed + execution. Built on top of existing `ClusterSpec` framework, ClusterResolvers + are a way for TensorFlow to communicate with various cluster management + systems (e.g. GCE, AWS, etc...). 
+""" from __future__ import absolute_import from __future__ import division diff --git a/tensorflow/python/distribute/cluster_resolver/cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/cluster_resolver.py index c636c98254c..5b61f847801 100644 --- a/tensorflow/python/distribute/cluster_resolver/cluster_resolver.py +++ b/tensorflow/python/distribute/cluster_resolver/cluster_resolver.py @@ -90,7 +90,7 @@ class ClusterResolver(object): @abc.abstractmethod def cluster_spec(self): - """Retrieve the current state of the cluster and returns a ClusterSpec. + """Retrieve the current state of the cluster and return a ClusterSpec. Returns: A ClusterSpec representing the state of the cluster at the moment this @@ -288,7 +288,7 @@ class UnionClusterResolver(ClusterResolver): when cluster_spec is called. The details of the merge function is documented in the cluster_spec function. - For additional Cluster Resolver properties such as task type, task index, + For additional ClusterResolver properties such as task type, task index, rpc layer, environment, etc..., we will return the value from the first ClusterResolver in the union. """ diff --git a/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver.py index 9d7dfdd1ea9..70d42e80a70 100644 --- a/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver.py +++ b/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Implementation of Cluster Resolvers for GCE Instance Groups.""" +"""Implementation of ClusterResolvers for GCE instance groups.""" from __future__ import absolute_import from __future__ import division @@ -33,12 +33,12 @@ except ImportError: @tf_export('distribute.cluster_resolver.GCEClusterResolver') class GCEClusterResolver(ClusterResolver): - """Cluster Resolver for Google Compute Engine. + """ClusterResolver for Google Compute Engine. This is an implementation of cluster resolvers for the Google Compute Engine instance group platform. By specifying a project, zone, and instance group, this will retrieve the IP address of all the instances within the instance - group and return a Cluster Resolver object suitable for use for distributed + group and return a ClusterResolver object suitable for use for distributed TensorFlow. """ diff --git a/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver.py index 28b2712590d..f812df0e5c7 100644 --- a/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver.py +++ b/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver.py @@ -33,7 +33,7 @@ except ImportError: @tf_export('distribute.cluster_resolver.KubernetesClusterResolver') class KubernetesClusterResolver(ClusterResolver): - """Cluster Resolver for Kubernetes. + """ClusterResolver for Kubernetes. This is an implementation of cluster resolvers for Kubernetes. When given the the Kubernetes namespace and label selector for pods, we will retrieve the @@ -48,7 +48,7 @@ class KubernetesClusterResolver(ClusterResolver): override_client=None): """Initializes a new KubernetesClusterResolver. - This initializes a new Kubernetes Cluster Resolver. The Cluster Resolver + This initializes a new Kubernetes ClusterResolver. 
The ClusterResolver will attempt to talk to the Kubernetes master to retrieve all the instances of pods matching a label selector. diff --git a/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py index 0e49cebee2b..1d6d346ddf2 100644 --- a/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py +++ b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py @@ -30,13 +30,13 @@ from tensorflow.python.util.tf_export import tf_export @tf_export('distribute.cluster_resolver.SlurmClusterResolver') class SlurmClusterResolver(ClusterResolver): - """Cluster Resolver for system with Slurm workload manager. + """ClusterResolver for systems with the Slurm workload manager. This is an implementation of cluster resolvers for Slurm clusters. This allows the specification of jobs and task counts, number of tasks per node, number of - GPUs on each node and number of GPUs for each task, It retrieves system + GPUs on each node and number of GPUs for each task. It retrieves system attributes by Slurm environment variables, resolves allocated computing node - names, construct a cluster and return a Cluster Resolver object which an be + names, constructs a cluster and returns a ClusterResolver object which can be use for distributed TensorFlow. """ @@ -61,15 +61,15 @@ class SlurmClusterResolver(ClusterResolver): """Creates a new SlurmClusterResolver object. This takes in parameters and creates a SlurmClusterResolver object. It uses - those parameters to check which nodes will processes reside and resolves + those parameters to check which nodes processes will reside on and resolves their hostnames. With the number of the GPUs on each node and number of GPUs - for each task it offsets the port number for each processes and allocate + for each task it offsets the port number for each process and allocates GPUs to tasks by setting environment variables. 
The resolver currently supports homogeneous tasks and default Slurm process allocation. Args: jobs: Dictionary with job names as key and number of tasks in the job as - value + value. port_base: The first port number to start with for processes on a node. gpus_per_node: Number of GPUs available on each node. gpus_per_task: Number of GPUs to be used for each task. diff --git a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py index c9b6191a1c0..421351944c2 100644 --- a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py +++ b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py @@ -50,7 +50,12 @@ def _get_value_in_tfconfig(key, default=None): @tf_export('distribute.cluster_resolver.TFConfigClusterResolver') class TFConfigClusterResolver(ClusterResolver): - """Implementation of a ClusterResolver which reads the TF_CONFIG EnvVar.""" + """Implementation of a ClusterResolver which reads the TF_CONFIG EnvVar. + + This is an implementation of cluster resolvers when using TF_CONFIG to set + information about the cluster. The cluster spec returned will be + initialized from the TF_CONFIG environment variable. + """ def __init__(self, task_type=None, diff --git a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py index 253708c132c..757d2a47b64 100644 --- a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py +++ b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py @@ -94,7 +94,7 @@ class TPUClusterResolver(ClusterResolver): This works around an issue where the underlying HTTP connection sometimes times out when the script has been running for too long. 
Other methods in - this object calls this method to get a new API object whenever they need + this object call this method to get a new API object whenever they need to communicate with the Cloud API. Returns: @@ -206,7 +206,7 @@ class TPUClusterResolver(ClusterResolver): for the IP addresses and ports of each Cloud TPU listed. Args: - tpu: A string corresponding to the TPU to use. If the string is the empty + tpu: A string corresponding to the TPU to use. If the string is an empty string, the string 'local', or a string that begins with 'grpc://' or '/bns', then it is assumed to not correspond with a Cloud TPU and will instead be passed as the session master and no ClusterSpec propagation