Add get_tpu_system_metadata
API to TPUClusterResolver. Also export tf.tpu.experimental.TPUSystemMetadata
and tf.tpu.experimental.Topology
symbols.
PiperOrigin-RevId: 301721761 Change-Id: I765e04f0e8cb3e2f556b3486d6ee692dcb0456ac
This commit is contained in:
parent
60fcd57c93
commit
a90d94d4fc
@ -164,7 +164,6 @@ py_library(
|
||||
"//tensorflow/python:util",
|
||||
"//tensorflow/python:variable_scope",
|
||||
"//tensorflow/python/data",
|
||||
"//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib",
|
||||
"//tensorflow/python/ops/losses",
|
||||
"//tensorflow/python/ops/losses:loss_reduction",
|
||||
"//tensorflow/tools/docs:doc_controls",
|
||||
|
@ -67,6 +67,7 @@ py_library(
|
||||
deps = [
|
||||
":base_cluster_resolver_py",
|
||||
"//tensorflow/python:training_server_lib",
|
||||
"//tensorflow/python/tpu:tpu_lib",
|
||||
"//tensorflow/python/tpu/client",
|
||||
] + tf_additional_rpc_deps(),
|
||||
)
|
||||
|
@ -20,9 +20,9 @@ from __future__ import print_function
|
||||
|
||||
from tensorflow.python import framework
|
||||
from tensorflow.python.client import session
|
||||
from tensorflow.python.distribute.cluster_resolver import ClusterResolver
|
||||
from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
|
||||
from tensorflow.python.distribute.cluster_resolver import UnionClusterResolver
|
||||
from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
|
||||
from tensorflow.python.distribute.cluster_resolver.cluster_resolver import SimpleClusterResolver
|
||||
from tensorflow.python.distribute.cluster_resolver.cluster_resolver import UnionClusterResolver
|
||||
from tensorflow.python.eager.context import LogicalDevice
|
||||
from tensorflow.python.framework import test_util
|
||||
from tensorflow.python.platform import test
|
||||
|
@ -18,8 +18,8 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from tensorflow.python.distribute.cluster_resolver import GCEClusterResolver
|
||||
from tensorflow.python.distribute.cluster_resolver import UnionClusterResolver
|
||||
from tensorflow.python.distribute.cluster_resolver.cluster_resolver import UnionClusterResolver
|
||||
from tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver import GCEClusterResolver
|
||||
from tensorflow.python.platform import test
|
||||
from tensorflow.python.training import server_lib
|
||||
|
||||
|
@ -18,7 +18,7 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from tensorflow.python.distribute.cluster_resolver import KubernetesClusterResolver
|
||||
from tensorflow.python.distribute.cluster_resolver.kubernetes_cluster_resolver import KubernetesClusterResolver
|
||||
from tensorflow.python.platform import test
|
||||
from tensorflow.python.training import server_lib
|
||||
|
||||
|
@ -20,8 +20,9 @@ from __future__ import print_function
|
||||
|
||||
import os
|
||||
|
||||
from tensorflow.python.distribute.cluster_resolver import SlurmClusterResolver
|
||||
from tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver import expand_hostlist, expand_tasks_per_node
|
||||
from tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver import expand_hostlist
|
||||
from tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver import expand_tasks_per_node
|
||||
from tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver import SlurmClusterResolver
|
||||
from tensorflow.python.platform import test
|
||||
from tensorflow.python.training import server_lib
|
||||
|
||||
|
@ -22,7 +22,7 @@ import os
|
||||
|
||||
from tensorflow.python import framework
|
||||
from tensorflow.python.client import session
|
||||
from tensorflow.python.distribute.cluster_resolver import TFConfigClusterResolver
|
||||
from tensorflow.python.distribute.cluster_resolver.tfconfig_cluster_resolver import TFConfigClusterResolver
|
||||
from tensorflow.python.eager.context import LogicalDevice
|
||||
from tensorflow.python.framework import test_util
|
||||
from tensorflow.python.platform import test
|
||||
|
@ -24,6 +24,7 @@ import re
|
||||
from tensorflow.python.distribute.cluster_resolver import cluster_resolver
|
||||
from tensorflow.python.framework import errors
|
||||
from tensorflow.python.platform import tf_logging as logging
|
||||
from tensorflow.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
|
||||
from tensorflow.python.training import server_lib
|
||||
from tensorflow.python.util import compat
|
||||
from tensorflow.python.util.tf_export import tf_export
|
||||
@ -219,6 +220,16 @@ class TPUClusterResolver(cluster_resolver.ClusterResolver):
|
||||
def get_job_name(self):
|
||||
return self.task_type
|
||||
|
||||
def get_tpu_system_metadata(self):
  """Retrieves TPU system metadata given a TPUClusterResolver.

  Queries the TPU system reachable at `self.master()`, passing along the
  cluster definition derived from `self.cluster_spec()` when one is
  available. The topology is not queried (`query_topology=False`).

  Returns:
    The metadata object produced by
    `tpu_system_metadata_lib._query_tpu_system_metadata`.
  """
  cluster_spec = self.cluster_spec()
  # A falsy cluster spec means there is no cluster definition to forward.
  cluster_def = cluster_spec.as_cluster_def() if cluster_spec else None
  tpu_system_metadata = (
      tpu_system_metadata_lib._query_tpu_system_metadata(  # pylint: disable=protected-access
          self.master(),
          cluster_def=cluster_def,
          query_topology=False))

  # Callers store this result, so it must be returned rather than dropped.
  return tpu_system_metadata
|
||||
|
||||
def cluster_spec(self):
|
||||
"""Returns a ClusterSpec object based on the latest TPU information.
|
||||
|
||||
|
@ -53,29 +53,12 @@ from tensorflow.python.ops import resource_variable_ops
|
||||
from tensorflow.python.tpu import device_assignment as device_assignment_lib # pylint: disable=unused-import
|
||||
from tensorflow.python.tpu import tpu
|
||||
from tensorflow.python.tpu import tpu_strategy_util
|
||||
from tensorflow.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
|
||||
from tensorflow.python.tpu import training_loop
|
||||
from tensorflow.python.tpu.ops import tpu_ops
|
||||
from tensorflow.python.util import nest
|
||||
from tensorflow.python.util.tf_export import tf_export
|
||||
|
||||
|
||||
def get_tpu_system_metadata(tpu_cluster_resolver):
|
||||
"""Retrieves TPU system metadata given a TPUClusterResolver."""
|
||||
master = tpu_cluster_resolver.master()
|
||||
|
||||
# pylint: disable=protected-access
|
||||
cluster_spec = tpu_cluster_resolver.cluster_spec()
|
||||
cluster_def = cluster_spec.as_cluster_def() if cluster_spec else None
|
||||
tpu_system_metadata = (
|
||||
tpu_system_metadata_lib._query_tpu_system_metadata(
|
||||
master,
|
||||
cluster_def=cluster_def,
|
||||
query_topology=False))
|
||||
|
||||
return tpu_system_metadata
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def maybe_init_scope():
|
||||
if ops.executing_eagerly_outside_functions():
|
||||
@ -287,7 +270,7 @@ class TPUExtended(distribute_lib.StrategyExtendedV1):
|
||||
|
||||
self._tpu_function_cache = weakref.WeakKeyDictionary()
|
||||
self._tpu_cluster_resolver = tpu_cluster_resolver
|
||||
self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
|
||||
self._tpu_metadata = self._tpu_cluster_resolver.get_tpu_system_metadata()
|
||||
self._device_assignment = device_assignment
|
||||
|
||||
tpu_devices_flat = [
|
||||
|
@ -843,6 +843,7 @@ py_library(
|
||||
":context",
|
||||
"//tensorflow/core:protos_all_py",
|
||||
"//tensorflow/python:platform",
|
||||
"//tensorflow/python/distribute/cluster_resolver:base_cluster_resolver_py",
|
||||
],
|
||||
)
|
||||
|
||||
|
@ -25,7 +25,7 @@ import numpy as np
|
||||
import six
|
||||
|
||||
from tensorflow.python.data.ops import dataset_ops
|
||||
from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
|
||||
from tensorflow.python.distribute.cluster_resolver.cluster_resolver import SimpleClusterResolver
|
||||
from tensorflow.python.eager import context
|
||||
from tensorflow.python.eager import def_function
|
||||
from tensorflow.python.eager import remote
|
||||
|
@ -22,6 +22,7 @@ import numpy as np
|
||||
from six.moves import xrange # pylint: disable=redefined-builtin
|
||||
|
||||
from tensorflow.core.protobuf.tpu import topology_pb2
|
||||
from tensorflow.python.util.tf_export import tf_export
|
||||
|
||||
|
||||
def _tpu_device_name(job, task, device):
|
||||
@ -40,6 +41,7 @@ def _tpu_host_device_name(job, task):
|
||||
return "/job:%s/task:%d/device:CPU:0" % (job, task)
|
||||
|
||||
|
||||
@tf_export("tpu.experimental.Topology")
|
||||
class Topology(object):
|
||||
"""Describes a set of TPU devices.
|
||||
|
||||
|
@ -20,7 +20,7 @@ from __future__ import print_function
|
||||
|
||||
from tensorflow.core.protobuf import config_pb2
|
||||
from tensorflow.python.client import session as session_lib
|
||||
from tensorflow.python.distribute.cluster_resolver import TPUClusterResolver
|
||||
from tensorflow.python.distribute.cluster_resolver.tpu_cluster_resolver import TPUClusterResolver
|
||||
from tensorflow.python.eager import context
|
||||
from tensorflow.python.eager import function
|
||||
from tensorflow.python.framework import device
|
||||
|
@ -30,6 +30,7 @@ from tensorflow.python.framework import errors
|
||||
from tensorflow.python.framework import ops
|
||||
from tensorflow.python.platform import tf_logging as logging
|
||||
from tensorflow.python.tpu import tpu
|
||||
from tensorflow.python.util.tf_export import tf_export
|
||||
|
||||
_PINGING_MASTER_TIMEOUT_IN_MS = 5 * 60 * 1000 # 5 min
|
||||
_RETRY_TIMES = 12 * 24 # 1 day
|
||||
@ -39,15 +40,33 @@ _DEFAULT_JOB_NAME = 'tpu_worker'
|
||||
_DEFAULT_COORDINATOR_JOB_NAME = 'coordinator'
|
||||
_LOCAL_MASTERS = ('', 'local')
|
||||
|
||||
# _TPUSystemMetadata is used by TPUEstimator to hold TPU configuration,
|
||||
# including num_cores and num_hosts.
|
||||
_TPUSystemMetadata = collections.namedtuple('_TPUSystemMetadata', [
|
||||
'num_cores',
|
||||
'num_hosts',
|
||||
'num_of_cores_per_host',
|
||||
'topology',
|
||||
'devices',
|
||||
])
|
||||
|
||||
@tf_export('tpu.experimental.TPUSystemMetadata')
|
||||
class TPUSystemMetadata(
|
||||
collections.namedtuple('TPUSystemMetadata', [
|
||||
'num_cores',
|
||||
'num_hosts',
|
||||
'num_of_cores_per_host',
|
||||
'topology',
|
||||
'devices',
|
||||
])):
|
||||
"""Describes some metadata about the TPU system.
|
||||
|
||||
Attributes:
|
||||
num_cores: integer. Total number of TPU cores in the TPU system.
|
||||
num_hosts: integer. Total number of hosts (TPU workers) in the TPU system.
|
||||
num_of_cores_per_host: integer. Number of TPU cores per host (TPU worker).
|
||||
topology: an instance of `tf.tpu.experimental.Topology`, which describes the
|
||||
physical topology of TPU system.
|
||||
devices: a tuple of strings, which describes all the TPU devices in the
|
||||
system.
|
||||
"""
|
||||
|
||||
def __new__(cls, num_cores, num_hosts, num_of_cores_per_host, topology,
|
||||
devices):
|
||||
return super(TPUSystemMetadata,
|
||||
cls).__new__(cls, num_cores, num_hosts, num_of_cores_per_host,
|
||||
topology, devices)
|
||||
|
||||
|
||||
def _query_tpu_system_metadata(master_address, cluster_def=None,
|
||||
@ -129,7 +148,7 @@ def _query_tpu_system_metadata(master_address, cluster_def=None,
|
||||
spec.device_index)
|
||||
devices = tuple(sorted(devices, key=_sort_key))
|
||||
|
||||
metadata = _TPUSystemMetadata(
|
||||
metadata = TPUSystemMetadata(
|
||||
num_cores=tpu_core_count,
|
||||
num_hosts=len(device_dict),
|
||||
num_of_cores_per_host=num_of_cores_per_host,
|
||||
|
@ -23,6 +23,10 @@ tf_class {
|
||||
name: "get_master"
|
||||
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
|
||||
}
|
||||
member_method {
|
||||
name: "get_tpu_system_metadata"
|
||||
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
|
||||
}
|
||||
member_method {
|
||||
name: "master"
|
||||
argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
|
||||
|
@ -0,0 +1,35 @@
|
||||
path: "tensorflow.tpu.experimental.TPUSystemMetadata"
|
||||
tf_class {
|
||||
is_instance: "<class \'tensorflow.python.tpu.tpu_system_metadata.TPUSystemMetadata\'>"
|
||||
is_instance: "<class \'tensorflow.python.tpu.tpu_system_metadata.TPUSystemMetadata\'>"
|
||||
is_instance: "<type \'tuple\'>"
|
||||
member {
|
||||
name: "devices"
|
||||
mtype: "<type \'property\'>"
|
||||
}
|
||||
member {
|
||||
name: "num_cores"
|
||||
mtype: "<type \'property\'>"
|
||||
}
|
||||
member {
|
||||
name: "num_hosts"
|
||||
mtype: "<type \'property\'>"
|
||||
}
|
||||
member {
|
||||
name: "num_of_cores_per_host"
|
||||
mtype: "<type \'property\'>"
|
||||
}
|
||||
member {
|
||||
name: "topology"
|
||||
mtype: "<type \'property\'>"
|
||||
}
|
||||
member_method {
|
||||
name: "__init__"
|
||||
}
|
||||
member_method {
|
||||
name: "count"
|
||||
}
|
||||
member_method {
|
||||
name: "index"
|
||||
}
|
||||
}
|
@ -0,0 +1,53 @@
|
||||
path: "tensorflow.tpu.experimental.Topology"
|
||||
tf_class {
|
||||
is_instance: "<class \'tensorflow.python.tpu.topology.Topology\'>"
|
||||
is_instance: "<type \'object\'>"
|
||||
member {
|
||||
name: "device_coordinates"
|
||||
mtype: "<type \'property\'>"
|
||||
}
|
||||
member {
|
||||
name: "mesh_rank"
|
||||
mtype: "<type \'property\'>"
|
||||
}
|
||||
member {
|
||||
name: "mesh_shape"
|
||||
mtype: "<type \'property\'>"
|
||||
}
|
||||
member {
|
||||
name: "missing_devices"
|
||||
mtype: "<type \'property\'>"
|
||||
}
|
||||
member {
|
||||
name: "num_tasks"
|
||||
mtype: "<type \'property\'>"
|
||||
}
|
||||
member {
|
||||
name: "num_tpus_per_task"
|
||||
mtype: "<type \'property\'>"
|
||||
}
|
||||
member_method {
|
||||
name: "__init__"
|
||||
argspec: "args=[\'self\', \'serialized\', \'mesh_shape\', \'device_coordinates\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
|
||||
}
|
||||
member_method {
|
||||
name: "cpu_device_name_at_coordinates"
|
||||
argspec: "args=[\'self\', \'device_coordinates\', \'job\'], varargs=None, keywords=None, defaults=[\'None\'], "
|
||||
}
|
||||
member_method {
|
||||
name: "serialized"
|
||||
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
|
||||
}
|
||||
member_method {
|
||||
name: "task_ordinal_at_coordinates"
|
||||
argspec: "args=[\'self\', \'device_coordinates\'], varargs=None, keywords=None, defaults=None"
|
||||
}
|
||||
member_method {
|
||||
name: "tpu_device_name_at_coordinates"
|
||||
argspec: "args=[\'self\', \'device_coordinates\', \'job\'], varargs=None, keywords=None, defaults=[\'None\'], "
|
||||
}
|
||||
member_method {
|
||||
name: "tpu_device_ordinal_at_coordinates"
|
||||
argspec: "args=[\'self\', \'device_coordinates\'], varargs=None, keywords=None, defaults=None"
|
||||
}
|
||||
}
|
@ -20,6 +20,14 @@ tf_module {
|
||||
name: "StochasticGradientDescentParameters"
|
||||
mtype: "<type \'type\'>"
|
||||
}
|
||||
member {
|
||||
name: "TPUSystemMetadata"
|
||||
mtype: "<type \'type\'>"
|
||||
}
|
||||
member {
|
||||
name: "Topology"
|
||||
mtype: "<type \'type\'>"
|
||||
}
|
||||
member_method {
|
||||
name: "embedding_column"
|
||||
argspec: "args=[\'categorical_column\', \'dimension\', \'combiner\', \'initializer\', \'max_sequence_length\', \'learning_rate_fn\', \'embedding_lookup_device\', \'tensor_core_shape\', \'use_safe_embedding_lookup\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\', \'0\', \'None\', \'None\', \'None\', \'True\'], "
|
||||
|
@ -23,6 +23,10 @@ tf_class {
|
||||
name: "get_master"
|
||||
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
|
||||
}
|
||||
member_method {
|
||||
name: "get_tpu_system_metadata"
|
||||
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
|
||||
}
|
||||
member_method {
|
||||
name: "master"
|
||||
argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
|
||||
|
@ -0,0 +1,35 @@
|
||||
path: "tensorflow.tpu.experimental.TPUSystemMetadata"
|
||||
tf_class {
|
||||
is_instance: "<class \'tensorflow.python.tpu.tpu_system_metadata.TPUSystemMetadata\'>"
|
||||
is_instance: "<class \'tensorflow.python.tpu.tpu_system_metadata.TPUSystemMetadata\'>"
|
||||
is_instance: "<type \'tuple\'>"
|
||||
member {
|
||||
name: "devices"
|
||||
mtype: "<type \'property\'>"
|
||||
}
|
||||
member {
|
||||
name: "num_cores"
|
||||
mtype: "<type \'property\'>"
|
||||
}
|
||||
member {
|
||||
name: "num_hosts"
|
||||
mtype: "<type \'property\'>"
|
||||
}
|
||||
member {
|
||||
name: "num_of_cores_per_host"
|
||||
mtype: "<type \'property\'>"
|
||||
}
|
||||
member {
|
||||
name: "topology"
|
||||
mtype: "<type \'property\'>"
|
||||
}
|
||||
member_method {
|
||||
name: "__init__"
|
||||
}
|
||||
member_method {
|
||||
name: "count"
|
||||
}
|
||||
member_method {
|
||||
name: "index"
|
||||
}
|
||||
}
|
@ -0,0 +1,53 @@
|
||||
path: "tensorflow.tpu.experimental.Topology"
|
||||
tf_class {
|
||||
is_instance: "<class \'tensorflow.python.tpu.topology.Topology\'>"
|
||||
is_instance: "<type \'object\'>"
|
||||
member {
|
||||
name: "device_coordinates"
|
||||
mtype: "<type \'property\'>"
|
||||
}
|
||||
member {
|
||||
name: "mesh_rank"
|
||||
mtype: "<type \'property\'>"
|
||||
}
|
||||
member {
|
||||
name: "mesh_shape"
|
||||
mtype: "<type \'property\'>"
|
||||
}
|
||||
member {
|
||||
name: "missing_devices"
|
||||
mtype: "<type \'property\'>"
|
||||
}
|
||||
member {
|
||||
name: "num_tasks"
|
||||
mtype: "<type \'property\'>"
|
||||
}
|
||||
member {
|
||||
name: "num_tpus_per_task"
|
||||
mtype: "<type \'property\'>"
|
||||
}
|
||||
member_method {
|
||||
name: "__init__"
|
||||
argspec: "args=[\'self\', \'serialized\', \'mesh_shape\', \'device_coordinates\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
|
||||
}
|
||||
member_method {
|
||||
name: "cpu_device_name_at_coordinates"
|
||||
argspec: "args=[\'self\', \'device_coordinates\', \'job\'], varargs=None, keywords=None, defaults=[\'None\'], "
|
||||
}
|
||||
member_method {
|
||||
name: "serialized"
|
||||
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
|
||||
}
|
||||
member_method {
|
||||
name: "task_ordinal_at_coordinates"
|
||||
argspec: "args=[\'self\', \'device_coordinates\'], varargs=None, keywords=None, defaults=None"
|
||||
}
|
||||
member_method {
|
||||
name: "tpu_device_name_at_coordinates"
|
||||
argspec: "args=[\'self\', \'device_coordinates\', \'job\'], varargs=None, keywords=None, defaults=[\'None\'], "
|
||||
}
|
||||
member_method {
|
||||
name: "tpu_device_ordinal_at_coordinates"
|
||||
argspec: "args=[\'self\', \'device_coordinates\'], varargs=None, keywords=None, defaults=None"
|
||||
}
|
||||
}
|
@ -4,6 +4,14 @@ tf_module {
|
||||
name: "DeviceAssignment"
|
||||
mtype: "<type \'type\'>"
|
||||
}
|
||||
member {
|
||||
name: "TPUSystemMetadata"
|
||||
mtype: "<type \'type\'>"
|
||||
}
|
||||
member {
|
||||
name: "Topology"
|
||||
mtype: "<type \'type\'>"
|
||||
}
|
||||
member_method {
|
||||
name: "initialize_tpu_system"
|
||||
argspec: "args=[\'cluster_resolver\'], varargs=None, keywords=None, defaults=[\'None\'], "
|
||||
|
Loading…
Reference in New Issue
Block a user