- Make tensors from Grappler-created nodes visible to tfdbg. - To this end, add a wildcard node name to the DebugTensorWatch proto. - Add unit test based on tfdbg's filesystem dump mode. - A few unit tests are updated to account for the fact that additional tensors get watched (mainly under GPU tests) with all runtime graph nodes now being watched. PiperOrigin-RevId: 260997920
291 lines
12 KiB
Python
291 lines
12 KiB
Python
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
# ==============================================================================
|
|
"""TensorFlow Debugger (tfdbg) Utilities."""
|
|
|
|
from __future__ import absolute_import
|
|
from __future__ import division
|
|
from __future__ import print_function
|
|
|
|
import re
|
|
|
|
from six.moves import xrange # pylint: disable=redefined-builtin
|
|
|
|
|
|
def add_debug_tensor_watch(run_options,
                           node_name,
                           output_slot=0,
                           debug_ops="DebugIdentity",
                           debug_urls=None,
                           tolerate_debug_op_creation_failures=False,
                           global_step=-1):
  """Append one debug tensor watch to a `RunOptions` object.

  A new entry is added to `run_options.debug_options.debug_tensor_watch_opts`
  and filled in from the arguments. `run_options.debug_options.global_step` is
  overwritten on every call.

  N.B.:
    1. Under certain circumstances, the `Tensor` may not get actually watched
      (e.g., if the node of the `Tensor` is constant-folded during runtime).
    2. For debugging purposes, the `parallel_iteration` attribute of all
      `tf.while_loop`s in the graph are set to 1 to prevent any node from
      being executed multiple times concurrently. This change does not affect
      subsequent non-debugged runs of the same `tf.while_loop`s.

  Args:
    run_options: An instance of `config_pb2.RunOptions` to be modified.
    node_name: (`str`) name of the node to watch.
    output_slot: (`int`) output slot index of the tensor from the watched node.
    debug_ops: (`str` or `list` of `str`) name(s) of the debug op(s). A single
      `str` is treated as a one-element `list`.
      For debug op types with customizable attributes, each debug op string can
      optionally contain a list of attribute names, in the syntax of:
        debug_op_name(attr_name_1=attr_value_1;attr_name_2=attr_value_2;...)
    debug_urls: (`str` or `list` of `str`) URL(s) to send debug values to,
      e.g., `file:///tmp/tfdbg_dump_1`, `grpc://localhost:12345`. A single
      `str` is treated as a one-element `list`. If `None` or empty, no URL is
      attached to the watch.
    tolerate_debug_op_creation_failures: (`bool`) Whether to tolerate debug op
      creation failures by not throwing exceptions.
    global_step: (`int`) Optional global_step count for this debug tensor
      watch.
  """
  options = run_options.debug_options
  watch_list = options.debug_tensor_watch_opts
  options.global_step = global_step

  # The entry is created first and then populated field by field.
  entry = watch_list.add()
  entry.tolerate_debug_op_creation_failures = (
      tolerate_debug_op_creation_failures)
  entry.node_name = node_name
  entry.output_slot = output_slot

  # A bare string must be wrapped; extending with it directly would add one
  # element per character.
  op_names = [debug_ops] if isinstance(debug_ops, str) else debug_ops
  entry.debug_ops.extend(op_names)

  if not debug_urls:
    return

  url_list = [debug_urls] if isinstance(debug_urls, str) else debug_urls
  entry.debug_urls.extend(url_list)
|
|
|
|
|
|
def watch_graph(run_options,
                graph,
                debug_ops="DebugIdentity",
                debug_urls=None,
                node_name_regex_whitelist=None,
                op_type_regex_whitelist=None,
                tensor_dtype_regex_whitelist=None,
                tolerate_debug_op_creation_failures=False,
                global_step=-1,
                reset_disk_byte_usage=False):
  """Add debug watches to `RunOptions` for a TensorFlow graph.

  Every output tensor of every operation in `graph` that passes the whitelist
  filters gets a watch. To watch all `Tensor`s on the graph, leave
  `node_name_regex_whitelist`, `op_type_regex_whitelist` and
  `tensor_dtype_regex_whitelist` at the default (`None`). In that case an
  additional watch with the wildcard node name `"*"` and output slot -1 is
  added, so that nodes created internally by TensorFlow itself (e.g., by
  Grappler) can be watched as well.

  The regular expressions are applied with `match()`, i.e., they are anchored
  at the beginning of the string being tested.

  N.B.:
    1. Under certain circumstances, the `Tensor` may not get actually watched
      (e.g., if the node of the `Tensor` is constant-folded during runtime).
    2. For debugging purposes, the `parallel_iteration` attribute of all
      `tf.while_loop`s in the graph are set to 1 to prevent any node from
      being executed multiple times concurrently. This change does not affect
      subsequent non-debugged runs of the same `tf.while_loop`s.

  Args:
    run_options: An instance of `config_pb2.RunOptions` to be modified.
    graph: An instance of `ops.Graph`.
    debug_ops: (`str` or `list` of `str`) name(s) of the debug op(s) to use.
      Must not be empty or `None`.
      For debug op types with customizable attributes, each debug op name
      string can optionally contain a list of attribute names, in the syntax
      of:
        debug_op_name(attr_name_1=attr_value_1;attr_name_2=attr_value_2;...)
    debug_urls: URLs to send debug values to. Can be a list of strings or a
      single string; a single string is equivalent to a list consisting of
      that string, e.g., `file:///tmp/tfdbg_dump_1`,
      `grpc://localhost:12345`. Must not be empty or `None`.
    node_name_regex_whitelist: Regular-expression whitelist for node_name,
      e.g., `"(weight_[0-9]+|bias_.*)"`
    op_type_regex_whitelist: Regular-expression whitelist for the op type of
      nodes, e.g., `"(Variable|Add)"`.
      If both `node_name_regex_whitelist` and `op_type_regex_whitelist`
      are set, the two filtering operations will occur in a logical `AND`
      relation. In other words, a node will be included if and only if it
      hits both whitelists.
    tensor_dtype_regex_whitelist: Regular-expression whitelist for Tensor
      data type, e.g., `"^int.*"`.
      This whitelist operates in logical `AND` relations to the two whitelists
      above.
    tolerate_debug_op_creation_failures: (`bool`) whether debug op creation
      failures (e.g., due to dtype incompatibility) are to be tolerated by not
      throwing exceptions.
    global_step: (`int`) Optional global_step count for this debug tensor
      watch.
    reset_disk_byte_usage: (`bool`) whether to reset the tracked disk byte
      usage to zero (default: `False`).

  Raises:
    ValueError: If `debug_ops` or `debug_urls` is empty or `None`.
  """
  if not debug_ops:
    raise ValueError("debug_ops must not be empty or None.")
  if not debug_urls:
    raise ValueError("debug_urls must not be empty or None.")

  if isinstance(debug_ops, str):
    debug_ops = [debug_ops]

  # An unset (None or empty) whitelist means "no filtering" for that aspect.
  name_filter = None
  if node_name_regex_whitelist:
    name_filter = re.compile(node_name_regex_whitelist)
  type_filter = None
  if op_type_regex_whitelist:
    type_filter = re.compile(op_type_regex_whitelist)
  dtype_filter = None
  if tensor_dtype_regex_whitelist:
    dtype_filter = re.compile(tensor_dtype_regex_whitelist)

  for op in graph.get_operations():
    tensors = op.outputs
    if not tensors:
      # Nodes without any output tensors have nothing to watch.
      continue

    name, kind = op.name, op.type
    if name_filter is not None and name_filter.match(name) is None:
      continue
    if type_filter is not None and type_filter.match(kind) is None:
      continue

    for slot, tensor in enumerate(tensors):
      if (dtype_filter is not None and
          dtype_filter.match(tensor.dtype.name) is None):
        continue

      add_debug_tensor_watch(
          run_options,
          name,
          output_slot=slot,
          debug_ops=debug_ops,
          debug_urls=debug_urls,
          tolerate_debug_op_creation_failures=(
              tolerate_debug_op_creation_failures),
          global_step=global_step)

  # With no filter of any kind in effect, also register a wildcard node name
  # so that all nodes, including the ones created internally by TensorFlow
  # itself (e.g., by Grappler), can be watched during debugging.
  unfiltered = all(
      pattern is None for pattern in (name_filter, type_filter, dtype_filter))
  if unfiltered:
    add_debug_tensor_watch(
        run_options,
        "*",
        output_slot=-1,
        debug_ops=debug_ops,
        debug_urls=debug_urls,
        tolerate_debug_op_creation_failures=tolerate_debug_op_creation_failures,
        global_step=global_step)

  run_options.debug_options.reset_disk_byte_usage = reset_disk_byte_usage
|
|
|
|
|
|
def watch_graph_with_blacklists(run_options,
                                graph,
                                debug_ops="DebugIdentity",
                                debug_urls=None,
                                node_name_regex_blacklist=None,
                                op_type_regex_blacklist=None,
                                tensor_dtype_regex_blacklist=None,
                                tolerate_debug_op_creation_failures=False,
                                global_step=-1,
                                reset_disk_byte_usage=False):
  """Add debug tensor watches, blacklisting nodes and op types.

  This is similar to `watch_graph()`, but the node names, op types and tensor
  data types are blacklisted, instead of whitelisted. The regular expressions
  are applied with `match()`, i.e., they are anchored at the beginning of the
  string being tested.

  `debug_ops` and `debug_urls` are not validated here; they are forwarded to
  `add_debug_tensor_watch` for every watched tensor.

  N.B.:
    1. Under certain circumstances, the `Tensor` may not get actually watched
      (e.g., if the node of the `Tensor` is constant-folded during runtime).
    2. For debugging purposes, the `parallel_iteration` attribute of all
      `tf.while_loop`s in the graph are set to 1 to prevent any node from
      being executed multiple times concurrently. This change does not affect
      subsequent non-debugged runs of the same `tf.while_loop`s.

  Args:
    run_options: An instance of `config_pb2.RunOptions` to be modified.
    graph: An instance of `ops.Graph`.
    debug_ops: (`str` or `list` of `str`) name(s) of the debug op(s) to use.
      See the documentation of `watch_graph` for more details.
    debug_urls: URL(s) to send debug values to, e.g.,
      `file:///tmp/tfdbg_dump_1`, `grpc://localhost:12345`.
    node_name_regex_blacklist: Regular-expression blacklist for node_name.
      This should be a string, e.g., `"(weight_[0-9]+|bias_.*)"`.
    op_type_regex_blacklist: Regular-expression blacklist for the op type of
      nodes, e.g., `"(Variable|Add)"`.
      If both node_name_regex_blacklist and op_type_regex_blacklist
      are set, the two filtering operations will occur in a logical `OR`
      relation. In other words, a node will be excluded if it hits either of
      the two blacklists; a node will be included if and only if it hits
      neither of the blacklists.
    tensor_dtype_regex_blacklist: Regular-expression blacklist for Tensor
      data type, e.g., `"^int.*"`.
      This blacklist operates in logical `OR` relations to the two blacklists
      above.
    tolerate_debug_op_creation_failures: (`bool`) whether debug op creation
      failures (e.g., due to dtype incompatibility) are to be tolerated by not
      throwing exceptions.
    global_step: (`int`) Optional global_step count for this debug tensor
      watch.
    reset_disk_byte_usage: (`bool`) whether to reset the tracked disk byte
      usage to zero (default: `False`).
  """
  if isinstance(debug_ops, str):
    debug_ops = [debug_ops]

  # An unset (None or empty) blacklist excludes nothing for that aspect.
  name_block = None
  if node_name_regex_blacklist:
    name_block = re.compile(node_name_regex_blacklist)
  type_block = None
  if op_type_regex_blacklist:
    type_block = re.compile(op_type_regex_blacklist)
  dtype_block = None
  if tensor_dtype_regex_blacklist:
    dtype_block = re.compile(tensor_dtype_regex_blacklist)

  for op in graph.get_operations():
    tensors = op.outputs
    if not tensors:
      # Nodes without any output tensors have nothing to watch.
      continue

    name, kind = op.name, op.type
    if name_block is not None and name_block.match(name) is not None:
      continue
    if type_block is not None and type_block.match(kind) is not None:
      continue

    for slot, tensor in enumerate(tensors):
      if (dtype_block is not None and
          dtype_block.match(tensor.dtype.name) is not None):
        continue

      add_debug_tensor_watch(
          run_options,
          name,
          output_slot=slot,
          debug_ops=debug_ops,
          debug_urls=debug_urls,
          tolerate_debug_op_creation_failures=(
              tolerate_debug_op_creation_failures),
          global_step=global_step)

  run_options.debug_options.reset_disk_byte_usage = reset_disk_byte_usage
|