Add get_tpu_system_metadata API to TPUClusterResolver. Also export tf.tpu.experimental.TPUSystemMetadata and tf.tpu.experimental.Topology symbols.

PiperOrigin-RevId: 301721761
Change-Id: I765e04f0e8cb3e2f556b3486d6ee692dcb0456ac
This commit is contained in:
Ruoxin Sang 2020-03-18 19:43:20 -07:00 committed by TensorFlower Gardener
parent 60fcd57c93
commit a90d94d4fc
22 changed files with 257 additions and 40 deletions

View File

@ -164,7 +164,6 @@ py_library(
"//tensorflow/python:util",
"//tensorflow/python:variable_scope",
"//tensorflow/python/data",
"//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib",
"//tensorflow/python/ops/losses",
"//tensorflow/python/ops/losses:loss_reduction",
"//tensorflow/tools/docs:doc_controls",

View File

@ -67,6 +67,7 @@ py_library(
deps = [
":base_cluster_resolver_py",
"//tensorflow/python:training_server_lib",
"//tensorflow/python/tpu:tpu_lib",
"//tensorflow/python/tpu/client",
] + tf_additional_rpc_deps(),
)

View File

@ -20,9 +20,9 @@ from __future__ import print_function
from tensorflow.python import framework
from tensorflow.python.client import session
from tensorflow.python.distribute.cluster_resolver import ClusterResolver
from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
from tensorflow.python.distribute.cluster_resolver import UnionClusterResolver
from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
from tensorflow.python.distribute.cluster_resolver.cluster_resolver import SimpleClusterResolver
from tensorflow.python.distribute.cluster_resolver.cluster_resolver import UnionClusterResolver
from tensorflow.python.eager.context import LogicalDevice
from tensorflow.python.framework import test_util
from tensorflow.python.platform import test

View File

@ -18,8 +18,8 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.python.distribute.cluster_resolver import GCEClusterResolver
from tensorflow.python.distribute.cluster_resolver import UnionClusterResolver
from tensorflow.python.distribute.cluster_resolver.cluster_resolver import UnionClusterResolver
from tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver import GCEClusterResolver
from tensorflow.python.platform import test
from tensorflow.python.training import server_lib

View File

@ -18,7 +18,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.python.distribute.cluster_resolver import KubernetesClusterResolver
from tensorflow.python.distribute.cluster_resolver.kubernetes_cluster_resolver import KubernetesClusterResolver
from tensorflow.python.platform import test
from tensorflow.python.training import server_lib

View File

@ -20,8 +20,9 @@ from __future__ import print_function
import os
from tensorflow.python.distribute.cluster_resolver import SlurmClusterResolver
from tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver import expand_hostlist, expand_tasks_per_node
from tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver import expand_hostlist
from tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver import expand_tasks_per_node
from tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver import SlurmClusterResolver
from tensorflow.python.platform import test
from tensorflow.python.training import server_lib

View File

@ -22,7 +22,7 @@ import os
from tensorflow.python import framework
from tensorflow.python.client import session
from tensorflow.python.distribute.cluster_resolver import TFConfigClusterResolver
from tensorflow.python.distribute.cluster_resolver.tfconfig_cluster_resolver import TFConfigClusterResolver
from tensorflow.python.eager.context import LogicalDevice
from tensorflow.python.framework import test_util
from tensorflow.python.platform import test

View File

@ -24,6 +24,7 @@ import re
from tensorflow.python.distribute.cluster_resolver import cluster_resolver
from tensorflow.python.framework import errors
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
from tensorflow.python.training import server_lib
from tensorflow.python.util import compat
from tensorflow.python.util.tf_export import tf_export
@ -219,6 +220,16 @@ class TPUClusterResolver(cluster_resolver.ClusterResolver):
def get_job_name(self):
  """Returns this resolver's `task_type` attribute as the job name."""
  return self.task_type
def get_tpu_system_metadata(self):
  """Retrieves TPU system metadata given a TPUClusterResolver.

  Queries the TPU system at `self.master()`, forwarding the cluster definition
  derived from `self.cluster_spec()` when a cluster spec is available.
  Topology information is not requested (`query_topology=False`).

  Returns:
    The metadata object produced by
    `tpu_system_metadata_lib._query_tpu_system_metadata`.
  """
  cluster_spec = self.cluster_spec()
  # A falsy cluster spec is forwarded as `None` instead of being converted.
  cluster_def = cluster_spec.as_cluster_def() if cluster_spec else None
  tpu_system_metadata = (
      tpu_system_metadata_lib._query_tpu_system_metadata(  # pylint: disable=protected-access
          self.master(),
          cluster_def=cluster_def,
          query_topology=False))

  # Hand the queried metadata back to the caller; without this the method
  # would implicitly return None and the query result would be discarded.
  return tpu_system_metadata
def cluster_spec(self):
"""Returns a ClusterSpec object based on the latest TPU information.

View File

@ -53,29 +53,12 @@ from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.tpu import device_assignment as device_assignment_lib # pylint: disable=unused-import
from tensorflow.python.tpu import tpu
from tensorflow.python.tpu import tpu_strategy_util
from tensorflow.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
from tensorflow.python.tpu import training_loop
from tensorflow.python.tpu.ops import tpu_ops
from tensorflow.python.util import nest
from tensorflow.python.util.tf_export import tf_export
def get_tpu_system_metadata(tpu_cluster_resolver):
  """Queries and returns the TPU system metadata for `tpu_cluster_resolver`.

  Args:
    tpu_cluster_resolver: object providing `master()` and `cluster_spec()`.

  Returns:
    The result of `tpu_system_metadata_lib._query_tpu_system_metadata`.
  """
  master_address = tpu_cluster_resolver.master()
  spec = tpu_cluster_resolver.cluster_spec()
  # Only convert the spec to a cluster definition when it is truthy.
  if spec:
    cluster_def = spec.as_cluster_def()
  else:
    cluster_def = None
  # Topology is not requested (query_topology=False).
  return tpu_system_metadata_lib._query_tpu_system_metadata(  # pylint: disable=protected-access
      master_address, cluster_def=cluster_def, query_topology=False)
@contextlib.contextmanager
def maybe_init_scope():
if ops.executing_eagerly_outside_functions():
@ -287,7 +270,7 @@ class TPUExtended(distribute_lib.StrategyExtendedV1):
self._tpu_function_cache = weakref.WeakKeyDictionary()
self._tpu_cluster_resolver = tpu_cluster_resolver
self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
self._tpu_metadata = self._tpu_cluster_resolver.get_tpu_system_metadata()
self._device_assignment = device_assignment
tpu_devices_flat = [

View File

@ -843,6 +843,7 @@ py_library(
":context",
"//tensorflow/core:protos_all_py",
"//tensorflow/python:platform",
"//tensorflow/python/distribute/cluster_resolver:base_cluster_resolver_py",
],
)

View File

@ -25,7 +25,7 @@ import numpy as np
import six
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
from tensorflow.python.distribute.cluster_resolver.cluster_resolver import SimpleClusterResolver
from tensorflow.python.eager import context
from tensorflow.python.eager import def_function
from tensorflow.python.eager import remote

View File

@ -22,6 +22,7 @@ import numpy as np
from six.moves import xrange # pylint: disable=redefined-builtin
from tensorflow.core.protobuf.tpu import topology_pb2
from tensorflow.python.util.tf_export import tf_export
def _tpu_device_name(job, task, device):
@ -40,6 +41,7 @@ def _tpu_host_device_name(job, task):
return "/job:%s/task:%d/device:CPU:0" % (job, task)
@tf_export("tpu.experimental.Topology")
class Topology(object):
"""Describes a set of TPU devices.

View File

@ -20,7 +20,7 @@ from __future__ import print_function
from tensorflow.core.protobuf import config_pb2
from tensorflow.python.client import session as session_lib
from tensorflow.python.distribute.cluster_resolver import TPUClusterResolver
from tensorflow.python.distribute.cluster_resolver.tpu_cluster_resolver import TPUClusterResolver
from tensorflow.python.eager import context
from tensorflow.python.eager import function
from tensorflow.python.framework import device

View File

@ -30,6 +30,7 @@ from tensorflow.python.framework import errors
from tensorflow.python.framework import ops
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.tpu import tpu
from tensorflow.python.util.tf_export import tf_export
_PINGING_MASTER_TIMEOUT_IN_MS = 5 * 60 * 1000 # 5 min
_RETRY_TIMES = 12 * 24 # 1 day
@ -39,15 +40,33 @@ _DEFAULT_JOB_NAME = 'tpu_worker'
_DEFAULT_COORDINATOR_JOB_NAME = 'coordinator'
_LOCAL_MASTERS = ('', 'local')
# Lightweight record type that TPUEstimator uses to carry TPU configuration
# (core/host counts, per-host core count, topology, and device list).
_TPUSystemMetadata = collections.namedtuple(
    typename='_TPUSystemMetadata',
    field_names=(
        'num_cores',
        'num_hosts',
        'num_of_cores_per_host',
        'topology',
        'devices',
    ),
)
@tf_export('tpu.experimental.TPUSystemMetadata')
class TPUSystemMetadata(
    collections.namedtuple('TPUSystemMetadata', [
        'num_cores',
        'num_hosts',
        'num_of_cores_per_host',
        'topology',
        'devices',
    ])):
  """Describes some metadata about the TPU system.

  Attributes:
    num_cores: integer. Total number of TPU cores in the TPU system.
    num_hosts: integer. Total number of hosts (TPU workers) in the TPU system.
    num_of_cores_per_host: integer. Number of TPU cores per host (TPU worker).
    topology: an instance of `tf.tpu.experimental.Topology`, which describes the
      physical topology of TPU system.
    devices: a tuple of strings, which describes all the TPU devices in the
      system.
  """

  def __new__(cls, num_cores, num_hosts, num_of_cores_per_host, topology,
              devices):
    # Explicit pass-through constructor: every field is forwarded unchanged,
    # in declaration order, to the namedtuple base class.
    return super(TPUSystemMetadata,
                 cls).__new__(cls, num_cores, num_hosts, num_of_cores_per_host,
                              topology, devices)
def _query_tpu_system_metadata(master_address, cluster_def=None,
@ -129,7 +148,7 @@ def _query_tpu_system_metadata(master_address, cluster_def=None,
spec.device_index)
devices = tuple(sorted(devices, key=_sort_key))
metadata = _TPUSystemMetadata(
metadata = TPUSystemMetadata(
num_cores=tpu_core_count,
num_hosts=len(device_dict),
num_of_cores_per_host=num_of_cores_per_host,

View File

@ -23,6 +23,10 @@ tf_class {
name: "get_master"
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "get_tpu_system_metadata"
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "master"
argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "

View File

@ -0,0 +1,35 @@
path: "tensorflow.tpu.experimental.TPUSystemMetadata"
tf_class {
is_instance: "<class \'tensorflow.python.tpu.tpu_system_metadata.TPUSystemMetadata\'>"
is_instance: "<class \'tensorflow.python.tpu.tpu_system_metadata.TPUSystemMetadata\'>"
is_instance: "<type \'tuple\'>"
member {
name: "devices"
mtype: "<type \'property\'>"
}
member {
name: "num_cores"
mtype: "<type \'property\'>"
}
member {
name: "num_hosts"
mtype: "<type \'property\'>"
}
member {
name: "num_of_cores_per_host"
mtype: "<type \'property\'>"
}
member {
name: "topology"
mtype: "<type \'property\'>"
}
member_method {
name: "__init__"
}
member_method {
name: "count"
}
member_method {
name: "index"
}
}

View File

@ -0,0 +1,53 @@
path: "tensorflow.tpu.experimental.Topology"
tf_class {
is_instance: "<class \'tensorflow.python.tpu.topology.Topology\'>"
is_instance: "<type \'object\'>"
member {
name: "device_coordinates"
mtype: "<type \'property\'>"
}
member {
name: "mesh_rank"
mtype: "<type \'property\'>"
}
member {
name: "mesh_shape"
mtype: "<type \'property\'>"
}
member {
name: "missing_devices"
mtype: "<type \'property\'>"
}
member {
name: "num_tasks"
mtype: "<type \'property\'>"
}
member {
name: "num_tpus_per_task"
mtype: "<type \'property\'>"
}
member_method {
name: "__init__"
argspec: "args=[\'self\', \'serialized\', \'mesh_shape\', \'device_coordinates\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
}
member_method {
name: "cpu_device_name_at_coordinates"
argspec: "args=[\'self\', \'device_coordinates\', \'job\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
member_method {
name: "serialized"
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "task_ordinal_at_coordinates"
argspec: "args=[\'self\', \'device_coordinates\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "tpu_device_name_at_coordinates"
argspec: "args=[\'self\', \'device_coordinates\', \'job\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
member_method {
name: "tpu_device_ordinal_at_coordinates"
argspec: "args=[\'self\', \'device_coordinates\'], varargs=None, keywords=None, defaults=None"
}
}

View File

@ -20,6 +20,14 @@ tf_module {
name: "StochasticGradientDescentParameters"
mtype: "<type \'type\'>"
}
member {
name: "TPUSystemMetadata"
mtype: "<type \'type\'>"
}
member {
name: "Topology"
mtype: "<type \'type\'>"
}
member_method {
name: "embedding_column"
argspec: "args=[\'categorical_column\', \'dimension\', \'combiner\', \'initializer\', \'max_sequence_length\', \'learning_rate_fn\', \'embedding_lookup_device\', \'tensor_core_shape\', \'use_safe_embedding_lookup\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\', \'0\', \'None\', \'None\', \'None\', \'True\'], "

View File

@ -23,6 +23,10 @@ tf_class {
name: "get_master"
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "get_tpu_system_metadata"
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "master"
argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "

View File

@ -0,0 +1,35 @@
path: "tensorflow.tpu.experimental.TPUSystemMetadata"
tf_class {
is_instance: "<class \'tensorflow.python.tpu.tpu_system_metadata.TPUSystemMetadata\'>"
is_instance: "<class \'tensorflow.python.tpu.tpu_system_metadata.TPUSystemMetadata\'>"
is_instance: "<type \'tuple\'>"
member {
name: "devices"
mtype: "<type \'property\'>"
}
member {
name: "num_cores"
mtype: "<type \'property\'>"
}
member {
name: "num_hosts"
mtype: "<type \'property\'>"
}
member {
name: "num_of_cores_per_host"
mtype: "<type \'property\'>"
}
member {
name: "topology"
mtype: "<type \'property\'>"
}
member_method {
name: "__init__"
}
member_method {
name: "count"
}
member_method {
name: "index"
}
}

View File

@ -0,0 +1,53 @@
path: "tensorflow.tpu.experimental.Topology"
tf_class {
is_instance: "<class \'tensorflow.python.tpu.topology.Topology\'>"
is_instance: "<type \'object\'>"
member {
name: "device_coordinates"
mtype: "<type \'property\'>"
}
member {
name: "mesh_rank"
mtype: "<type \'property\'>"
}
member {
name: "mesh_shape"
mtype: "<type \'property\'>"
}
member {
name: "missing_devices"
mtype: "<type \'property\'>"
}
member {
name: "num_tasks"
mtype: "<type \'property\'>"
}
member {
name: "num_tpus_per_task"
mtype: "<type \'property\'>"
}
member_method {
name: "__init__"
argspec: "args=[\'self\', \'serialized\', \'mesh_shape\', \'device_coordinates\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
}
member_method {
name: "cpu_device_name_at_coordinates"
argspec: "args=[\'self\', \'device_coordinates\', \'job\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
member_method {
name: "serialized"
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "task_ordinal_at_coordinates"
argspec: "args=[\'self\', \'device_coordinates\'], varargs=None, keywords=None, defaults=None"
}
member_method {
name: "tpu_device_name_at_coordinates"
argspec: "args=[\'self\', \'device_coordinates\', \'job\'], varargs=None, keywords=None, defaults=[\'None\'], "
}
member_method {
name: "tpu_device_ordinal_at_coordinates"
argspec: "args=[\'self\', \'device_coordinates\'], varargs=None, keywords=None, defaults=None"
}
}

View File

@ -4,6 +4,14 @@ tf_module {
name: "DeviceAssignment"
mtype: "<type \'type\'>"
}
member {
name: "TPUSystemMetadata"
mtype: "<type \'type\'>"
}
member {
name: "Topology"
mtype: "<type \'type\'>"
}
member_method {
name: "initialize_tpu_system"
argspec: "args=[\'cluster_resolver\'], varargs=None, keywords=None, defaults=[\'None\'], "