tfdbg CLI: add node traceback to node_info output + a bug fix

The bug concerned getting a string representation of feed_dict keys when the keys are str/unicode rather than Tensors (in cli_shared.py: get_run_short_description()). Unit tests are added.

Another hitchhiking change:
Add mouse-click command shortcuts to the help output shown on TensorFlow runtime errors (in cli_shared.py: get_error_intro())
Change: 143055926
Shanqing Cai 2016-12-27 14:11:18 -08:00 committed by TensorFlower Gardener
parent 2c09aaeefa
commit 88d776bfd4
8 changed files with 267 additions and 17 deletions


@@ -116,6 +116,7 @@ Try the following commands at the `tfdbg>` prompt (referencing the code at
| `/inf` | Search the screen output with the regex `inf` and highlight any matches. |
| `/` | Scroll to the next line with matches to the searched regex (if any). |
| `ni -a hidden/Relu` | Display information about the node `hidden/Relu`, including node attributes. |
| `ni -t hidden/Relu` | Display the stack trace of node `hidden/Relu`'s construction. |
| `li -r hidden/Relu:0` | List the inputs to the node `hidden/Relu`, recursively—i.e., the input tree. |
| `lo -r hidden/Relu:0` | List the recipients of the output of the node `hidden/Relu`, recursively—i.e., the output recipient tree. |
| `lt -n softmax.*` | List all dumped tensors whose names match the regular-expression pattern `softmax.*`. |
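
For orientation, the `tfdbg>` prompt these commands run in comes from wrapping a `Session` in the local CLI debug wrapper. Below is a minimal sketch of that setup (the tiny graph is a placeholder; the tutorial itself uses the MNIST model from `debug_mnist.py`):

```python
import tensorflow as tf
from tensorflow.python import debug as tf_debug

# A trivial stand-in graph; in the tutorial this is the MNIST model.
x = tf.constant([[1.0, 2.0]], name="x")
w = tf.constant([[3.0], [4.0]], name="w")
y = tf.matmul(x, w, name="y")

sess = tf.Session()
# Wrapping the session makes every run() call drop into the tfdbg CLI,
# where commands such as `ni -t y` can be issued at the tfdbg> prompt.
sess = tf_debug.LocalCLIDebugWrapperSession(sess)
sess.run(y)
```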
@@ -190,7 +191,7 @@ tfdbg> /inf
Or, alternatively:
``` none
```none
tfdbg> /(inf|nan)
```
@@ -199,7 +200,7 @@ about the node `cross_entropy/Log` by clicking the underlined `node_info` menu
item on the top or entering the equivalent command:
```none
ni cross_entropy/Log
tfdbg> ni cross_entropy/Log
```
![tfdbg run-end UI: infs and nans](tfdbg_screenshot_run_end_node_info.png)
@@ -214,14 +215,21 @@ tfdbg> pt softmax/Softmax:0
Examine the values in the input tensor, and search to see if there are any zeros:
``` none
```none
tfdbg> /0\.000
```
Indeed, there are zeros. Now it is clear that the origin of the bad numerical
values is the node `cross_entropy/Log` taking logs of zeros. You can go back to
the source code in [`debug_mnist.py`](https://www.tensorflow.org/code/tensorflow/python/debug/examples/debug_mnist.py)
and infer that the culprit line is:
values is the node `cross_entropy/Log` taking logs of zeros. To find out the
culprit line in the Python source code, use the `-t` flag of the `ni` command
to show the traceback of the node's construction:
```none
tfdbg> ni -t cross_entropy/Log
```
From the traceback, you can see that the op is constructed at line 109 of
[`debug_mnist.py`](https://www.tensorflow.org/code/tensorflow/python/debug/examples/debug_mnist.py):
```python
diff = y_ * tf.log(y)


@@ -185,6 +185,13 @@ class DebugAnalyzer(object):
dest="dumps",
action="store_true",
help="Also list dumps available from the node.")
ap.add_argument(
"-t",
"--traceback",
dest="traceback",
action="store_true",
help="Also include the traceback of the node's creation "
"(if available in Python).")
self._arg_parsers["node_info"] = ap
# Parser for list_inputs.
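
For reference, the new flag behaves like any argparse boolean switch. A self-contained sketch mirroring the `-t/--traceback` argument added above (the `prog` name and positional `node_name` argument here are illustrative, not the actual `node_info` parser):

```python
import argparse

ap = argparse.ArgumentParser(prog="node_info")
ap.add_argument("node_name", help="Name of the node to inspect.")
ap.add_argument(
    "-t",
    "--traceback",
    dest="traceback",
    action="store_true",
    help="Also include the traceback of the node's creation "
    "(if available in Python).")

# store_true defaults to False and flips to True when -t/--traceback is given.
parsed = ap.parse_args(["-t", "hidden/Relu"])
assert parsed.traceback and parsed.node_name == "hidden/Relu"
```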
@@ -512,9 +519,44 @@ class DebugAnalyzer(object):
lines.extend(self._list_node_dumps(node_name))
output = debugger_cli_common.RichTextLines(lines)
if parsed.traceback:
output.extend(self._render_node_traceback(node_name))
_add_main_menu(output, node_name=node_name, enable_node_info=False)
return output
def _render_node_traceback(self, node_name):
"""Render traceback of a node's creation in Python, if available.
Args:
node_name: (str) name of the node.
Returns:
A RichTextLines object containing the stack trace of the node's
construction.
"""
lines = ["", "", "Traceback of node construction:"]
font_attr_segs = {len(lines) - 1: [(0, len(lines[-1]), "bold")]}
try:
node_stack = self._debug_dump.node_traceback(node_name)
for depth, (file_path, line, function_name, text) in enumerate(
node_stack):
lines.append("%d: %s" % (depth, file_path))
lines.append(" Line: %d" % line)
lines.append(" Function: %s" % function_name)
lines.append(" Text: " + (("\"%s\"" % text) if text else "None"))
lines.append("")
except KeyError:
lines.append("(Node unavailable in the loaded Python graph)")
except LookupError:
lines.append("(Unavailable because no Python graph has been loaded)")
return debugger_cli_common.RichTextLines(lines,
font_attr_segs=font_attr_segs)
def list_inputs(self, args, screen_info=None):
"""Command handler for inputs.


@@ -128,7 +128,9 @@ def assert_node_attribute_lines(tst,
recipient_op_type_node_name_pairs,
ctrl_recipient_op_type_node_name_pairs,
attr_key_val_pairs=None,
num_dumped_tensors=None):
num_dumped_tensors=None,
show_stack_trace=False,
stack_trace_available=False):
"""Check RichTextLines output for node_info commands.
Args:
@@ -148,6 +150,9 @@ def assert_node_attribute_lines(tst,
attr_key_val_pairs: Optional: attribute key-value pairs of the node, as a
list of 2-tuples.
num_dumped_tensors: Optional: number of tensor dumps from the node.
show_stack_trace: (bool) whether the stack trace of the node's
construction is asserted to be present.
stack_trace_available: (bool) whether Python stack trace is available.
"""
line_iter = iter(out.lines)
@@ -251,6 +256,35 @@ def assert_node_attribute_lines(tst,
tst.assertEqual(sorted(dump_timestamps_ms), dump_timestamps_ms)
if show_stack_trace:
tst.assertEqual("", next(line_iter))
tst.assertEqual("", next(line_iter))
tst.assertEqual("Traceback of node construction:", next(line_iter))
if stack_trace_available:
try:
depth_counter = 0
while True:
for i in range(5):
line = next(line_iter)
print(line)
if i == 0:
tst.assertEqual(depth_counter, int(line.split(":")[0]))
elif i == 1:
tst.assertStartsWith(line, " Line:")
elif i == 2:
tst.assertStartsWith(line, " Function:")
elif i == 3:
tst.assertStartsWith(line, " Text:")
elif i == 4:
tst.assertEqual("", line)
depth_counter += 1
except StopIteration:
tst.assertEqual(0, i)
else:
tst.assertEqual("(Unavailable because no Python graph has been loaded)",
next(line_iter))
def check_syntax_error_output(tst, out, command_prefix):
"""Check RichTextLines output for valid command prefix but invalid syntax."""
@@ -336,7 +370,8 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
else:
cls._main_device = "/job:localhost/replica:0/task:0/cpu:0"
with session.Session() as sess:
cls._sess = session.Session()
with cls._sess as sess:
u_init_val = np.array([[5.0, 3.0], [-1.0, 0.0]])
v_init_val = np.array([[2.0], [-1.0]])
@@ -548,6 +583,52 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
print_tensor_node_name=node_name,
list_outputs_node_name=node_name)
def testNodeInfoShowStackTraceUnavailableIsIndicated(self):
self._debug_dump.set_python_graph(None)
node_name = "simple_mul_add/matmul"
out = self._registry.dispatch_command("node_info", ["-t", node_name])
assert_node_attribute_lines(
self,
out,
node_name,
"MatMul",
self._main_device, [("Identity", "simple_mul_add/u/read"),
("Identity", "simple_mul_add/v/read")], [],
[("Add", "simple_mul_add/add"), ("Add", "simple_mul_add/add")], [],
show_stack_trace=True, stack_trace_available=False)
check_main_menu(
self,
out,
list_tensors_enabled=True,
list_inputs_node_name=node_name,
print_tensor_node_name=node_name,
list_outputs_node_name=node_name)
def testNodeInfoShowStackTraceAvailableWorks(self):
self._debug_dump.set_python_graph(self._sess.graph)
node_name = "simple_mul_add/matmul"
out = self._registry.dispatch_command("node_info", ["-t", node_name])
assert_node_attribute_lines(
self,
out,
node_name,
"MatMul",
self._main_device, [("Identity", "simple_mul_add/u/read"),
("Identity", "simple_mul_add/v/read")], [],
[("Add", "simple_mul_add/add"), ("Add", "simple_mul_add/add")], [],
show_stack_trace=True, stack_trace_available=True)
check_main_menu(
self,
out,
list_tensors_enabled=True,
list_inputs_node_name=node_name,
print_tensor_node_name=node_name,
list_outputs_node_name=node_name)
def testNodeInfoByTensorName(self):
node_name = "simple_mul_add/u/read"
tensor_name = node_name + ":0"


@@ -352,7 +352,8 @@ def get_run_short_description(run_call_count, fetches, feed_dict):
else:
if len(feed_dict) == 1:
for key in feed_dict:
description += "1 feed (%s)" % key.name
description += "1 feed (%s)" % (
key if isinstance(key, six.string_types) else key.name)
else:
description += "%d feeds" % len(feed_dict)
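
This hunk is the bug fix described in the commit message: a plain str/unicode feed key has no `.name` attribute, so the old formatting raised an AttributeError. A minimal standalone sketch of the pattern (the helper and the fake Tensor class are illustrative, not library code):

```python
import six


class _FakeTensor(object):
  """Stands in for a tf.Tensor, which exposes its graph name via .name."""
  name = "a:0"


def describe_feed_key(key):
  # Tensor-like feed keys are described by their .name; plain str/unicode
  # feed keys (also accepted by Session.run) are used verbatim.
  return key if isinstance(key, six.string_types) else key.name


assert describe_feed_key(_FakeTensor()) == "a:0"
assert describe_feed_key(u"input_placeholder") == "input_placeholder"
```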
@@ -384,14 +385,19 @@ def get_error_intro(tf_error):
intro_lines, font_attr_segs=intro_font_attr_segs)
out.extend(
_recommend_command("ni %s" % op_name,
"Inspect information about the failing op."))
_recommend_command("ni -a -d -t %s" % op_name,
"Inspect information about the failing op.",
create_link=True))
out.extend(
_recommend_command("li -r %s" % op_name,
"List inputs to the failing op, recursively."))
"List inputs to the failing op, recursively.",
create_link=True))
out.extend(
_recommend_command(
"lt", "List all tensors dumped during the failing run() call."))
"lt",
"List all tensors dumped during the failing run() call.",
create_link=True))
more_lines = [
"",

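This hunk is the mouse-click shortcut mentioned in the commit message: besides bolding each recommended command, the error help text now attaches a menu item to the command's text segment so that frontends with mouse support can treat it as a clickable link. A rough sketch of the pattern, assuming the `MenuItem` and `RichTextLines` classes in `debugger_cli_common` (this is not the actual `_recommend_command` implementation):

```python
from tensorflow.python.debug.cli import debugger_cli_common


def recommend_command(command, description, create_link=False):
  """Render a recommended command, optionally as a clickable link."""
  if create_link:
    # A MenuItem in the font attribute segment makes the span clickable in
    # frontends that support mouse events; "bold" keeps it highlighted.
    attrs = [debugger_cli_common.MenuItem("", command), "bold"]
  else:
    attrs = ["bold"]
  lines = ["  " + command, "    " + description, ""]
  return debugger_cli_common.RichTextLines(
      lines, font_attr_segs={0: [(2, 2 + len(command), attrs)]})
```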

@@ -227,6 +227,16 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
command_set.add(annot[2].content)
self.assertEqual({"run -f filter_a", "run -f filter_b"}, command_set)
def testGetRunShortDescriptionWorksForTensorFeedKey(self):
short_description = cli_shared.get_run_short_description(
1, self.const_a, {self.const_a: 42.0})
self.assertEqual("run #1: 1 fetch (a:0); 1 feed (a:0)", short_description)
def testGetRunShortDescriptionWorksForUnicodeFeedKey(self):
short_description = cli_shared.get_run_short_description(
1, self.const_a, {u"foo": 42.0})
self.assertEqual("run #1: 1 fetch (a:0); 1 feed (foo)", short_description)
class GetErrorIntroTest(test_util.TensorFlowTestCase):
@@ -247,14 +257,25 @@ class GetErrorIntroTest(test_util.TensorFlowTestCase):
self.assertEqual([(0, len(error_intro.lines[1]), "blink")],
error_intro.font_attr_segs[1])
self.assertEqual(2, error_intro.lines[4].index("ni a/Assign"))
self.assertEqual([(2, 13, "bold")], error_intro.font_attr_segs[4])
self.assertEqual(2, error_intro.lines[4].index("ni -a -d -t a/Assign"))
self.assertEqual(2, error_intro.font_attr_segs[4][0][0])
self.assertEqual(22, error_intro.font_attr_segs[4][0][1])
self.assertEqual("ni -a -d -t a/Assign",
error_intro.font_attr_segs[4][0][2][0].content)
self.assertEqual("bold", error_intro.font_attr_segs[4][0][2][1])
self.assertEqual(2, error_intro.lines[6].index("li -r a/Assign"))
self.assertEqual([(2, 16, "bold")], error_intro.font_attr_segs[6])
self.assertEqual(2, error_intro.font_attr_segs[6][0][0])
self.assertEqual(16, error_intro.font_attr_segs[6][0][1])
self.assertEqual("li -r a/Assign",
error_intro.font_attr_segs[6][0][2][0].content)
self.assertEqual("bold", error_intro.font_attr_segs[6][0][2][1])
self.assertEqual(2, error_intro.lines[8].index("lt"))
self.assertEqual([(2, 4, "bold")], error_intro.font_attr_segs[8])
self.assertEqual(2, error_intro.font_attr_segs[8][0][0])
self.assertEqual(4, error_intro.font_attr_segs[8][0][1])
self.assertEqual("lt", error_intro.font_attr_segs[8][0][2][0].content)
self.assertEqual("bold", error_intro.font_attr_segs[8][0][2][1])
self.assertStartsWith(error_intro.lines[11], "Op name:")
self.assertTrue(error_intro.lines[11].endswith("a/Assign"))


@@ -378,6 +378,8 @@ class DebugDumpDir(object):
self._create_tensor_watch_maps()
self._load_partition_graphs(partition_graphs, validate)
self._python_graph = None
def _load_dumps(self, dump_root):
"""Load DebugTensorDatum instances from the dump root.
@@ -471,6 +473,23 @@ class DebugDumpDir(object):
self._watch_key_to_rel_time[datum.watch_key].append(datum.timestamp -
self._t0)
def set_python_graph(self, python_graph):
"""Provide Python `Graph` object to the wrapper.
Unlike the partition graphs, which are protobuf `GraphDef` objects, `Graph`
is a Python object and carries additional information such as the traceback
of nodes in the graph.
Args:
python_graph: (ops.Graph) The Python Graph object.
"""
self._python_graph = python_graph
self._node_traceback = {}
if self._python_graph:
for op in self._python_graph.get_operations():
self._node_traceback[op.name] = op.traceback
@property
def dumped_tensor_data(self):
return self._dump_tensor_data
@@ -1136,3 +1155,27 @@ class DebugDumpDir(object):
watch_key)
return self._watch_key_to_rel_time[watch_key]
def node_traceback(self, element_name):
"""Try to retrieve the Python traceback of node's construction.
Args:
element_name: (str) Name of a graph element (node or tensor).
Returns:
(list) The traceback list object, in the format returned by the
`extract_stack` function of Python's traceback module.
Raises:
LookupError: If Python graph is not available for traceback lookup.
KeyError: If the node cannot be found in the Python graph loaded.
"""
if self._python_graph is None:
raise LookupError("Python graph is not available for traceback lookup")
node_name = get_node_name(element_name)
if node_name not in self._node_traceback:
raise KeyError("Cannot find node \"%s\" in Python graph" % node_name)
return self._node_traceback[node_name]
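
`set_python_graph()` works because every `Operation` in a Python `Graph` records the stack frames of its construction, which `node_traceback()` then exposes by node or tensor name. A self-contained sketch of that underlying mechanism, using the TF 1.x-era API this commit targets:

```python
import tensorflow as tf

g = tf.Graph()
with g.as_default():
  a = tf.constant(1.0, name="demo/a")
  b = tf.constant(2.0, name="demo/b")
  c = tf.add(a, b, name="demo/c")

op = g.get_operation_by_name("demo/c")
# Each frame is a (file_path, line_number, function_name, text) tuple, the
# format rendered by `ni -t` in the analyzer CLI.
for file_path, line, function_name, text in op.traceback:
  print("%s:%d, in %s: %s" % (file_path, line, function_name, text))
```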


@@ -932,6 +932,54 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
self.assertTrue(np.isnan(numeric_summary[10]))
self.assertTrue(np.isnan(numeric_summary[11]))
def testLookUpNodePythonTracebackWorks(self):
with session.Session() as sess:
u_init = constant_op.constant(10.0)
u = variables.Variable(u_init, name="traceback/u")
v_init = constant_op.constant(20.0)
v = variables.Variable(v_init, name="traceback/v")
w = math_ops.multiply(u, v, name="traceback/w")
sess.run(variables.global_variables_initializer())
run_metadata = config_pb2.RunMetadata()
run_options = config_pb2.RunOptions(output_partition_graphs=True)
debug_utils.watch_graph(
run_options, sess.graph, debug_urls=self._debug_urls())
sess.run(w, options=run_options, run_metadata=run_metadata)
dump = debug_data.DebugDumpDir(
self._dump_root, partition_graphs=run_metadata.partition_graphs)
# Prior to setting the Python graph, attempts to do traceback lookup
# should lead to exceptions.
with self.assertRaisesRegexp(
LookupError, "Python graph is not available for traceback lookup"):
dump.node_traceback("traceback/w")
dump.set_python_graph(sess.graph)
# After setting the Python graph, attempts to look up nonexistent nodes
# should lead to exceptions.
with self.assertRaisesRegexp(
KeyError, r"Cannot find node \"foo\" in Python graph"):
dump.node_traceback("foo")
# Lookup should work with node name input.
traceback = dump.node_traceback("traceback/w")
self.assertIsInstance(traceback, list)
self.assertGreater(len(traceback), 0)
for trace in traceback:
self.assertIsInstance(trace, tuple)
# Lookup should also work with tensor name input.
traceback = dump.node_traceback("traceback/w:0")
self.assertIsInstance(traceback, list)
self.assertGreater(len(traceback), 0)
for trace in traceback:
self.assertIsInstance(trace, tuple)
if __name__ == "__main__":
googletest.main()


@@ -257,6 +257,7 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
debug_dump = debug_data.DebugDumpDir(
self._dump_root, partition_graphs=partition_graphs)
debug_dump.set_python_graph(self._sess.graph)
passed_filter = None
if self._active_tensor_filter: