tfdbg CLI: add node traceback to node_info output + a bug fix
The bug was related to getting a string representation of feed_dict keys when the keys are str/unicode (in cli_shared.py: get_run_short_description()). Unit tests have been added. Another hitchhiking change: add a command mouse-click shortcut to the help output on TensorFlow runtime errors (in cli_shared.py: get_error_intro()). Change: 143055926
This commit is contained in:
parent
2c09aaeefa
commit
88d776bfd4
@ -116,6 +116,7 @@ Try the following commands at the `tfdbg>` prompt (referencing the code at
|
||||
| `/inf` | Search the screen output with the regex `inf` and highlight any matches. |
|
||||
| `/` | Scroll to the next line with matches to the searched regex (if any). |
|
||||
| `ni -a hidden/Relu` | Display information about the node `hidden/Relu`, including node attributes. |
|
||||
| `ni -t hidden/Relu` | Display the stack trace of node `hidden/Relu`'s construction. |
|
||||
| `li -r hidden/Relu:0` | List the inputs to the node `hidden/Relu`, recursively—i.e., the input tree. |
|
||||
| `lo -r hidden/Relu:0` | List the recipients of the output of the node `hidden/Relu`, recursively—i.e., the output recipient tree. |
|
||||
| `lt -n softmax.*` | List all dumped tensors whose names match the regular-expression pattern `softmax.*`. |
|
||||
@ -190,7 +191,7 @@ tfdbg> /inf
|
||||
|
||||
Or, alternatively:
|
||||
|
||||
``` none
|
||||
```none
|
||||
tfdbg> /(inf|nan)
|
||||
```
|
||||
|
||||
@ -199,7 +200,7 @@ about the node `cross_entropy/Log` by clicking the underlined `node_info` menu
|
||||
item on the top or entering the equivalent command:
|
||||
|
||||
```none
|
||||
ni cross_entropy/Log
|
||||
tfdbg> ni cross_entropy/Log
|
||||
```
|
||||
|
||||

|
||||
@ -214,14 +215,21 @@ tfdbg> pt softmax/Softmax:0
|
||||
|
||||
Examine the values in the input tensor, and search to see if there are any zeros:
|
||||
|
||||
``` none
|
||||
```none
|
||||
tfdbg> /0\.000
|
||||
```
|
||||
|
||||
Indeed, there are zeros. Now it is clear that the origin of the bad numerical
|
||||
values is the node `cross_entropy/Log` taking logs of zeros. You can go back to
|
||||
the source code in [`debug_mnist.py`](https://www.tensorflow.org/code/tensorflow/python/debug/examples/debug_mnist.py)
|
||||
and infer that the culprit line is:
|
||||
values is the node `cross_entropy/Log` taking logs of zeros. To find out the
|
||||
culprit line in the Python source code, use the `-t` flag of the `ni` command
|
||||
to show the traceback of the node's construction:
|
||||
|
||||
```none
|
||||
tfdbg> ni -t cross_entropy/Log
|
||||
```
|
||||
|
||||
From the traceback, you can see that the op is constructed at line 109 of
|
||||
[`debug_mnist.py`](https://www.tensorflow.org/code/tensorflow/python/debug/examples/debug_mnist.py):
|
||||
|
||||
```python
|
||||
diff = y_ * tf.log(y)
|
||||
|
@ -185,6 +185,13 @@ class DebugAnalyzer(object):
|
||||
dest="dumps",
|
||||
action="store_true",
|
||||
help="Also list dumps available from the node.")
|
||||
ap.add_argument(
|
||||
"-t",
|
||||
"--traceback",
|
||||
dest="traceback",
|
||||
action="store_true",
|
||||
help="Also include the traceback of the node's creation "
|
||||
"(if available in Python).")
|
||||
self._arg_parsers["node_info"] = ap
|
||||
|
||||
# Parser for list_inputs.
|
||||
@ -512,9 +519,44 @@ class DebugAnalyzer(object):
|
||||
lines.extend(self._list_node_dumps(node_name))
|
||||
|
||||
output = debugger_cli_common.RichTextLines(lines)
|
||||
|
||||
if parsed.traceback:
|
||||
output.extend(self._render_node_traceback(node_name))
|
||||
|
||||
_add_main_menu(output, node_name=node_name, enable_node_info=False)
|
||||
return output
|
||||
|
||||
def _render_node_traceback(self, node_name):
|
||||
"""Render traceback of a node's creation in Python, if available.
|
||||
|
||||
Args:
|
||||
node_name: (str) name of the node.
|
||||
|
||||
Returns:
|
||||
A RichTextLines object containing the stack trace of the node's
|
||||
construction.
|
||||
"""
|
||||
|
||||
lines = ["", "", "Traceback of node construction:"]
|
||||
font_attr_segs = {len(lines) - 1: [(0, len(lines[-1]), "bold")]}
|
||||
|
||||
try:
|
||||
node_stack = self._debug_dump.node_traceback(node_name)
|
||||
for depth, (file_path, line, function_name, text) in enumerate(
|
||||
node_stack):
|
||||
lines.append("%d: %s" % (depth, file_path))
|
||||
lines.append(" Line: %d" % line)
|
||||
lines.append(" Function: %s" % function_name)
|
||||
lines.append(" Text: " + (("\"%s\"" % text) if text else "None"))
|
||||
lines.append("")
|
||||
except KeyError:
|
||||
lines.append("(Node unavailable in the loaded Python graph)")
|
||||
except LookupError:
|
||||
lines.append("(Unavailable because no Python graph has been loaded)")
|
||||
|
||||
return debugger_cli_common.RichTextLines(lines,
|
||||
font_attr_segs=font_attr_segs)
|
||||
|
||||
def list_inputs(self, args, screen_info=None):
|
||||
"""Command handler for inputs.
|
||||
|
||||
|
@ -128,7 +128,9 @@ def assert_node_attribute_lines(tst,
|
||||
recipient_op_type_node_name_pairs,
|
||||
ctrl_recipient_op_type_node_name_pairs,
|
||||
attr_key_val_pairs=None,
|
||||
num_dumped_tensors=None):
|
||||
num_dumped_tensors=None,
|
||||
show_stack_trace=False,
|
||||
stack_trace_available=False):
|
||||
"""Check RichTextLines output for node_info commands.
|
||||
|
||||
Args:
|
||||
@ -148,6 +150,9 @@ def assert_node_attribute_lines(tst,
|
||||
attr_key_val_pairs: Optional: attribute key-value pairs of the node, as a
|
||||
list of 2-tuples.
|
||||
num_dumped_tensors: Optional: number of tensor dumps from the node.
|
||||
show_stack_trace: (bool) whether the stack trace of the node's
|
||||
construction is asserted to be present.
|
||||
stack_trace_available: (bool) whether Python stack trace is available.
|
||||
"""
|
||||
|
||||
line_iter = iter(out.lines)
|
||||
@ -251,6 +256,35 @@ def assert_node_attribute_lines(tst,
|
||||
|
||||
tst.assertEqual(sorted(dump_timestamps_ms), dump_timestamps_ms)
|
||||
|
||||
if show_stack_trace:
|
||||
tst.assertEqual("", next(line_iter))
|
||||
tst.assertEqual("", next(line_iter))
|
||||
tst.assertEqual("Traceback of node construction:", next(line_iter))
|
||||
if stack_trace_available:
|
||||
try:
|
||||
depth_counter = 0
|
||||
while True:
|
||||
for i in range(5):
|
||||
line = next(line_iter)
|
||||
print(line)
|
||||
if i == 0:
|
||||
tst.assertEqual(depth_counter, int(line.split(":")[0]))
|
||||
elif i == 1:
|
||||
tst.assertStartsWith(line, " Line:")
|
||||
elif i == 2:
|
||||
tst.assertStartsWith(line, " Function:")
|
||||
elif i == 3:
|
||||
tst.assertStartsWith(line, " Text:")
|
||||
elif i == 4:
|
||||
tst.assertEqual("", line)
|
||||
|
||||
depth_counter += 1
|
||||
except StopIteration:
|
||||
tst.assertEqual(0, i)
|
||||
else:
|
||||
tst.assertEqual("(Unavailable because no Python graph has been loaded)",
|
||||
next(line_iter))
|
||||
|
||||
|
||||
def check_syntax_error_output(tst, out, command_prefix):
|
||||
"""Check RichTextLines output for valid command prefix but invalid syntax."""
|
||||
@ -336,7 +370,8 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
|
||||
else:
|
||||
cls._main_device = "/job:localhost/replica:0/task:0/cpu:0"
|
||||
|
||||
with session.Session() as sess:
|
||||
cls._sess = session.Session()
|
||||
with cls._sess as sess:
|
||||
u_init_val = np.array([[5.0, 3.0], [-1.0, 0.0]])
|
||||
v_init_val = np.array([[2.0], [-1.0]])
|
||||
|
||||
@ -548,6 +583,52 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
|
||||
print_tensor_node_name=node_name,
|
||||
list_outputs_node_name=node_name)
|
||||
|
||||
def testNodeInfoShowStackTraceUnavailableIsIndicated(self):
|
||||
self._debug_dump.set_python_graph(None)
|
||||
|
||||
node_name = "simple_mul_add/matmul"
|
||||
out = self._registry.dispatch_command("node_info", ["-t", node_name])
|
||||
|
||||
assert_node_attribute_lines(
|
||||
self,
|
||||
out,
|
||||
node_name,
|
||||
"MatMul",
|
||||
self._main_device, [("Identity", "simple_mul_add/u/read"),
|
||||
("Identity", "simple_mul_add/v/read")], [],
|
||||
[("Add", "simple_mul_add/add"), ("Add", "simple_mul_add/add")], [],
|
||||
show_stack_trace=True, stack_trace_available=False)
|
||||
check_main_menu(
|
||||
self,
|
||||
out,
|
||||
list_tensors_enabled=True,
|
||||
list_inputs_node_name=node_name,
|
||||
print_tensor_node_name=node_name,
|
||||
list_outputs_node_name=node_name)
|
||||
|
||||
def testNodeInfoShowStackTraceAvailableWorks(self):
|
||||
self._debug_dump.set_python_graph(self._sess.graph)
|
||||
|
||||
node_name = "simple_mul_add/matmul"
|
||||
out = self._registry.dispatch_command("node_info", ["-t", node_name])
|
||||
|
||||
assert_node_attribute_lines(
|
||||
self,
|
||||
out,
|
||||
node_name,
|
||||
"MatMul",
|
||||
self._main_device, [("Identity", "simple_mul_add/u/read"),
|
||||
("Identity", "simple_mul_add/v/read")], [],
|
||||
[("Add", "simple_mul_add/add"), ("Add", "simple_mul_add/add")], [],
|
||||
show_stack_trace=True, stack_trace_available=True)
|
||||
check_main_menu(
|
||||
self,
|
||||
out,
|
||||
list_tensors_enabled=True,
|
||||
list_inputs_node_name=node_name,
|
||||
print_tensor_node_name=node_name,
|
||||
list_outputs_node_name=node_name)
|
||||
|
||||
def testNodeInfoByTensorName(self):
|
||||
node_name = "simple_mul_add/u/read"
|
||||
tensor_name = node_name + ":0"
|
||||
|
@ -352,7 +352,8 @@ def get_run_short_description(run_call_count, fetches, feed_dict):
|
||||
else:
|
||||
if len(feed_dict) == 1:
|
||||
for key in feed_dict:
|
||||
description += "1 feed (%s)" % key.name
|
||||
description += "1 feed (%s)" % (
|
||||
key if isinstance(key, six.string_types) else key.name)
|
||||
else:
|
||||
description += "%d feeds" % len(feed_dict)
|
||||
|
||||
@ -384,14 +385,19 @@ def get_error_intro(tf_error):
|
||||
intro_lines, font_attr_segs=intro_font_attr_segs)
|
||||
|
||||
out.extend(
|
||||
_recommend_command("ni %s" % op_name,
|
||||
"Inspect information about the failing op."))
|
||||
_recommend_command("ni -a -d -t %s" % op_name,
|
||||
"Inspect information about the failing op.",
|
||||
create_link=True))
|
||||
out.extend(
|
||||
_recommend_command("li -r %s" % op_name,
|
||||
"List inputs to the failing op, recursively."))
|
||||
"List inputs to the failing op, recursively.",
|
||||
create_link=True))
|
||||
|
||||
out.extend(
|
||||
_recommend_command(
|
||||
"lt", "List all tensors dumped during the failing run() call."))
|
||||
"lt",
|
||||
"List all tensors dumped during the failing run() call.",
|
||||
create_link=True))
|
||||
|
||||
more_lines = [
|
||||
"",
|
||||
|
@ -227,6 +227,16 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
|
||||
command_set.add(annot[2].content)
|
||||
self.assertEqual({"run -f filter_a", "run -f filter_b"}, command_set)
|
||||
|
||||
def testGetRunShortDescriptionWorksForTensorFeedKey(self):
|
||||
short_description = cli_shared.get_run_short_description(
|
||||
1, self.const_a, {self.const_a: 42.0})
|
||||
self.assertEqual("run #1: 1 fetch (a:0); 1 feed (a:0)", short_description)
|
||||
|
||||
def testGetRunShortDescriptionWorksForUnicodeFeedKey(self):
|
||||
short_description = cli_shared.get_run_short_description(
|
||||
1, self.const_a, {u"foo": 42.0})
|
||||
self.assertEqual("run #1: 1 fetch (a:0); 1 feed (foo)", short_description)
|
||||
|
||||
|
||||
class GetErrorIntroTest(test_util.TensorFlowTestCase):
|
||||
|
||||
@ -247,14 +257,25 @@ class GetErrorIntroTest(test_util.TensorFlowTestCase):
|
||||
self.assertEqual([(0, len(error_intro.lines[1]), "blink")],
|
||||
error_intro.font_attr_segs[1])
|
||||
|
||||
self.assertEqual(2, error_intro.lines[4].index("ni a/Assign"))
|
||||
self.assertEqual([(2, 13, "bold")], error_intro.font_attr_segs[4])
|
||||
self.assertEqual(2, error_intro.lines[4].index("ni -a -d -t a/Assign"))
|
||||
self.assertEqual(2, error_intro.font_attr_segs[4][0][0])
|
||||
self.assertEqual(22, error_intro.font_attr_segs[4][0][1])
|
||||
self.assertEqual("ni -a -d -t a/Assign",
|
||||
error_intro.font_attr_segs[4][0][2][0].content)
|
||||
self.assertEqual("bold", error_intro.font_attr_segs[4][0][2][1])
|
||||
|
||||
self.assertEqual(2, error_intro.lines[6].index("li -r a/Assign"))
|
||||
self.assertEqual([(2, 16, "bold")], error_intro.font_attr_segs[6])
|
||||
self.assertEqual(2, error_intro.font_attr_segs[6][0][0])
|
||||
self.assertEqual(16, error_intro.font_attr_segs[6][0][1])
|
||||
self.assertEqual("li -r a/Assign",
|
||||
error_intro.font_attr_segs[6][0][2][0].content)
|
||||
self.assertEqual("bold", error_intro.font_attr_segs[6][0][2][1])
|
||||
|
||||
self.assertEqual(2, error_intro.lines[8].index("lt"))
|
||||
self.assertEqual([(2, 4, "bold")], error_intro.font_attr_segs[8])
|
||||
self.assertEqual(2, error_intro.font_attr_segs[8][0][0])
|
||||
self.assertEqual(4, error_intro.font_attr_segs[8][0][1])
|
||||
self.assertEqual("lt", error_intro.font_attr_segs[8][0][2][0].content)
|
||||
self.assertEqual("bold", error_intro.font_attr_segs[8][0][2][1])
|
||||
|
||||
self.assertStartsWith(error_intro.lines[11], "Op name:")
|
||||
self.assertTrue(error_intro.lines[11].endswith("a/Assign"))
|
||||
|
@ -378,6 +378,8 @@ class DebugDumpDir(object):
|
||||
self._create_tensor_watch_maps()
|
||||
self._load_partition_graphs(partition_graphs, validate)
|
||||
|
||||
self._python_graph = None
|
||||
|
||||
def _load_dumps(self, dump_root):
|
||||
"""Load DebugTensorDatum instances from the dump root.
|
||||
|
||||
@ -471,6 +473,23 @@ class DebugDumpDir(object):
|
||||
self._watch_key_to_rel_time[datum.watch_key].append(datum.timestamp -
|
||||
self._t0)
|
||||
|
||||
def set_python_graph(self, python_graph):
|
||||
"""Provide Python `Graph` object to the wrapper.
|
||||
|
||||
Unlike the partition graphs, which are protobuf `GraphDef` objects, `Graph`
|
||||
is a Python object and carries additional information such as the traceback
|
||||
of nodes in the graph.
|
||||
|
||||
Args:
|
||||
python_graph: (ops.Graph) The Python Graph object.
|
||||
"""
|
||||
|
||||
self._python_graph = python_graph
|
||||
self._node_traceback = {}
|
||||
if self._python_graph:
|
||||
for op in self._python_graph.get_operations():
|
||||
self._node_traceback[op.name] = op.traceback
|
||||
|
||||
@property
|
||||
def dumped_tensor_data(self):
|
||||
return self._dump_tensor_data
|
||||
@ -1136,3 +1155,27 @@ class DebugDumpDir(object):
|
||||
watch_key)
|
||||
|
||||
return self._watch_key_to_rel_time[watch_key]
|
||||
|
||||
def node_traceback(self, element_name):
|
||||
"""Try to retrieve the Python traceback of node's construction.
|
||||
|
||||
Args:
|
||||
element_name: (str) Name of a graph element (node or tensor).
|
||||
|
||||
Returns:
|
||||
(list) The traceback list object as returned by the `extract_trace`
|
||||
method of Python's traceback module.
|
||||
|
||||
Raises:
|
||||
LookupError: If Python graph is not available for traceback lookup.
|
||||
KeyError: If the node cannot be found in the Python graph loaded.
|
||||
"""
|
||||
|
||||
if self._python_graph is None:
|
||||
raise LookupError("Python graph is not available for traceback lookup")
|
||||
|
||||
node_name = get_node_name(element_name)
|
||||
if node_name not in self._node_traceback:
|
||||
raise KeyError("Cannot find node \"%s\" in Python graph" % node_name)
|
||||
|
||||
return self._node_traceback[node_name]
|
||||
|
@ -932,6 +932,54 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
|
||||
self.assertTrue(np.isnan(numeric_summary[10]))
|
||||
self.assertTrue(np.isnan(numeric_summary[11]))
|
||||
|
||||
def testLookUpNodePythonTracebackWorks(self):
|
||||
with session.Session() as sess:
|
||||
u_init = constant_op.constant(10.0)
|
||||
u = variables.Variable(u_init, name="traceback/u")
|
||||
v_init = constant_op.constant(20.0)
|
||||
v = variables.Variable(v_init, name="traceback/v")
|
||||
|
||||
w = math_ops.multiply(u, v, name="traceback/w")
|
||||
|
||||
sess.run(variables.global_variables_initializer())
|
||||
|
||||
run_metadata = config_pb2.RunMetadata()
|
||||
run_options = config_pb2.RunOptions(output_partition_graphs=True)
|
||||
debug_utils.watch_graph(
|
||||
run_options, sess.graph, debug_urls=self._debug_urls())
|
||||
|
||||
sess.run(w, options=run_options, run_metadata=run_metadata)
|
||||
dump = debug_data.DebugDumpDir(
|
||||
self._dump_root, partition_graphs=run_metadata.partition_graphs)
|
||||
|
||||
# Prior to setting the Python graph, attempts to do traceback lookup
|
||||
# should lead to exceptions.
|
||||
with self.assertRaisesRegexp(
|
||||
LookupError, "Python graph is not available for traceback lookup"):
|
||||
dump.node_traceback("traceback/w")
|
||||
|
||||
dump.set_python_graph(sess.graph)
|
||||
|
||||
# After setting the Python graph, attempts to look up nonexistent nodes
|
||||
# should lead to exceptions.
|
||||
with self.assertRaisesRegexp(
|
||||
KeyError, r"Cannot find node \"foo\" in Python graph"):
|
||||
dump.node_traceback("foo")
|
||||
|
||||
# Lookup should work with node name input.
|
||||
traceback = dump.node_traceback("traceback/w")
|
||||
self.assertIsInstance(traceback, list)
|
||||
self.assertGreater(len(traceback), 0)
|
||||
for trace in traceback:
|
||||
self.assertIsInstance(trace, tuple)
|
||||
|
||||
# Lookup should also work with tensor name input.
|
||||
traceback = dump.node_traceback("traceback/w:0")
|
||||
self.assertIsInstance(traceback, list)
|
||||
self.assertGreater(len(traceback), 0)
|
||||
for trace in traceback:
|
||||
self.assertIsInstance(trace, tuple)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
googletest.main()
|
||||
|
@ -257,6 +257,7 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
|
||||
|
||||
debug_dump = debug_data.DebugDumpDir(
|
||||
self._dump_root, partition_graphs=partition_graphs)
|
||||
debug_dump.set_python_graph(self._sess.graph)
|
||||
|
||||
passed_filter = None
|
||||
if self._active_tensor_filter:
|
||||
|
Loading…
Reference in New Issue
Block a user