tfdbg CLI: add node traceback to node_info output + a bug fix

The bug concerned getting a string representation of feed_dict keys when the keys are str/unicode rather than Tensors (in cli_shared.py: get_run_short_description()). Unit tests are added.

Another hitchhiking change:
Add mouse-click command shortcuts to the help output shown on TensorFlow runtime errors (in cli_shared.py: get_error_intro())
Change: 143055926
Shanqing Cai 2016-12-27 14:11:18 -08:00 committed by TensorFlower Gardener
parent 2c09aaeefa
commit 88d776bfd4
8 changed files with 267 additions and 17 deletions


@@ -116,6 +116,7 @@ Try the following commands at the `tfdbg>` prompt (referencing the code at
| `/inf` | Search the screen output with the regex `inf` and highlight any matches. |
| `/` | Scroll to the next line with matches to the searched regex (if any). |
| `ni -a hidden/Relu` | Display information about the node `hidden/Relu`, including node attributes. |
| `ni -t hidden/Relu` | Display the stack trace of node `hidden/Relu`'s construction. |
| `li -r hidden/Relu:0` | List the inputs to the node `hidden/Relu`, recursively—i.e., the input tree. |
| `lo -r hidden/Relu:0` | List the recipients of the output of the node `hidden/Relu`, recursively—i.e., the output recipient tree. |
| `lt -n softmax.*` | List all dumped tensors whose names match the regular-expression pattern `softmax.*`. |
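
For orientation, the `tfdbg>` prompt these commands run in comes from wrapping a `Session` in the local CLI debug wrapper. Below is a minimal sketch of that setup (the tiny graph is a placeholder; the tutorial itself uses the MNIST model from `debug_mnist.py`):

```python
import tensorflow as tf
from tensorflow.python import debug as tf_debug

# A trivial stand-in graph; in the tutorial this is the MNIST model.
x = tf.constant([[1.0, 2.0]], name="x")
w = tf.constant([[3.0], [4.0]], name="w")
y = tf.matmul(x, w, name="y")

sess = tf.Session()
# Wrapping the session makes every run() call drop into the tfdbg CLI,
# where commands such as `ni -t y` can be issued at the tfdbg> prompt.
sess = tf_debug.LocalCLIDebugWrapperSession(sess)
sess.run(y)
```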
@@ -190,7 +191,7 @@ tfdbg> /inf
Or, alternatively:
``` none
```none
tfdbg> /(inf|nan)
```
@@ -199,7 +200,7 @@ about the node `cross_entropy/Log` by clicking the underlined `node_info` menu
item on the top or entering the equivalent command:
```none
ni cross_entropy/Log
tfdbg> ni cross_entropy/Log
```
![tfdbg run-end UI: infs and nans](tfdbg_screenshot_run_end_node_info.png)
@@ -214,14 +215,21 @@ tfdbg> pt softmax/Softmax:0
Examine the values in the input tensor, and search to see if there are any zeros:
``` none
```none
tfdbg> /0\.000
```
Indeed, there are zeros. Now it is clear that the origin of the bad numerical
values is the node `cross_entropy/Log` taking logs of zeros. You can go back to
the source code in [`debug_mnist.py`](https://www.tensorflow.org/code/tensorflow/python/debug/examples/debug_mnist.py)
and infer that the culprit line is:
values is the node `cross_entropy/Log` taking logs of zeros. To find out the
culprit line in the Python source code, use the `-t` flag of the `ni` command
to show the traceback of the node's construction:
```none
tfdbg> ni -t cross_entropy/Log
```
From the traceback, you can see that the op is constructed at line 109 of
[`debug_mnist.py`](https://www.tensorflow.org/code/tensorflow/python/debug/examples/debug_mnist.py):
```python
diff = y_ * tf.log(y)


@@ -185,6 +185,13 @@ class DebugAnalyzer(object):
dest="dumps",
action="store_true",
help="Also list dumps available from the node.")
ap.add_argument(
"-t",
"--traceback",
dest="traceback",
action="store_true",
help="Also include the traceback of the node's creation "
"(if available in Python).")
self._arg_parsers["node_info"] = ap
# Parser for list_inputs.
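
For reference, the new flag behaves like any argparse boolean switch. A self-contained sketch mirroring the `-t/--traceback` argument added above (the `prog` name and positional `node_name` argument here are illustrative, not the actual `node_info` parser):

```python
import argparse

ap = argparse.ArgumentParser(prog="node_info")
ap.add_argument("node_name", help="Name of the node to inspect.")
ap.add_argument(
    "-t",
    "--traceback",
    dest="traceback",
    action="store_true",
    help="Also include the traceback of the node's creation "
    "(if available in Python).")

# store_true defaults to False and flips to True when -t/--traceback is given.
parsed = ap.parse_args(["-t", "hidden/Relu"])
assert parsed.traceback and parsed.node_name == "hidden/Relu"
```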
@@ -512,9 +519,44 @@ class DebugAnalyzer(object):
lines.extend(self._list_node_dumps(node_name))
output = debugger_cli_common.RichTextLines(lines)
if parsed.traceback:
output.extend(self._render_node_traceback(node_name))
_add_main_menu(output, node_name=node_name, enable_node_info=False)
return output
def _render_node_traceback(self, node_name):
"""Render traceback of a node's creation in Python, if available.
Args:
node_name: (str) name of the node.
Returns:
A RichTextLines object containing the stack trace of the node's
construction.
"""
lines = ["", "", "Traceback of node construction:"]
font_attr_segs = {len(lines) - 1: [(0, len(lines[-1]), "bold")]}
try:
node_stack = self._debug_dump.node_traceback(node_name)
for depth, (file_path, line, function_name, text) in enumerate(
node_stack):
lines.append("%d: %s" % (depth, file_path))
lines.append(" Line: %d" % line)
lines.append(" Function: %s" % function_name)
lines.append(" Text: " + (("\"%s\"" % text) if text else "None"))
lines.append("")
except KeyError:
lines.append("(Node unavailable in the loaded Python graph)")
except LookupError:
lines.append("(Unavailable because no Python graph has been loaded)")
return debugger_cli_common.RichTextLines(lines,
font_attr_segs=font_attr_segs)
def list_inputs(self, args, screen_info=None):
"""Command handler for inputs.


@@ -128,7 +128,9 @@ def assert_node_attribute_lines(tst,
recipient_op_type_node_name_pairs,
ctrl_recipient_op_type_node_name_pairs,
attr_key_val_pairs=None,
num_dumped_tensors=None):
num_dumped_tensors=None,
show_stack_trace=False,
stack_trace_available=False):
"""Check RichTextLines output for node_info commands.
Args:
@@ -148,6 +150,9 @@ def assert_node_attribute_lines(tst,
attr_key_val_pairs: Optional: attribute key-value pairs of the node, as a
list of 2-tuples.
num_dumped_tensors: Optional: number of tensor dumps from the node.
show_stack_trace: (bool) whether the stack trace of the node's
construction is asserted to be present.
stack_trace_available: (bool) whether Python stack trace is available.
"""
line_iter = iter(out.lines)
@@ -251,6 +256,35 @@ def assert_node_attribute_lines(tst,
tst.assertEqual(sorted(dump_timestamps_ms), dump_timestamps_ms)
if show_stack_trace:
tst.assertEqual("", next(line_iter))
tst.assertEqual("", next(line_iter))
tst.assertEqual("Traceback of node construction:", next(line_iter))
if stack_trace_available:
try:
depth_counter = 0
while True:
for i in range(5):
line = next(line_iter)
print(line)
if i == 0:
tst.assertEqual(depth_counter, int(line.split(":")[0]))
elif i == 1:
tst.assertStartsWith(line, " Line:")
elif i == 2:
tst.assertStartsWith(line, " Function:")
elif i == 3:
tst.assertStartsWith(line, " Text:")
elif i == 4:
tst.assertEqual("", line)
depth_counter += 1
except StopIteration:
tst.assertEqual(0, i)
else:
tst.assertEqual("(Unavailable because no Python graph has been loaded)",
next(line_iter))
def check_syntax_error_output(tst, out, command_prefix):
"""Check RichTextLines output for valid command prefix but invalid syntax."""
@@ -336,7 +370,8 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
else:
cls._main_device = "/job:localhost/replica:0/task:0/cpu:0"
with session.Session() as sess:
cls._sess = session.Session()
with cls._sess as sess:
u_init_val = np.array([[5.0, 3.0], [-1.0, 0.0]])
v_init_val = np.array([[2.0], [-1.0]])
@@ -548,6 +583,52 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
print_tensor_node_name=node_name,
list_outputs_node_name=node_name)
def testNodeInfoShowStackTraceUnavailableIsIndicated(self):
self._debug_dump.set_python_graph(None)
node_name = "simple_mul_add/matmul"
out = self._registry.dispatch_command("node_info", ["-t", node_name])
assert_node_attribute_lines(
self,
out,
node_name,
"MatMul",
self._main_device, [("Identity", "simple_mul_add/u/read"),
("Identity", "simple_mul_add/v/read")], [],
[("Add", "simple_mul_add/add"), ("Add", "simple_mul_add/add")], [],
show_stack_trace=True, stack_trace_available=False)
check_main_menu(
self,
out,
list_tensors_enabled=True,
list_inputs_node_name=node_name,
print_tensor_node_name=node_name,
list_outputs_node_name=node_name)
def testNodeInfoShowStackTraceAvailableWorks(self):
self._debug_dump.set_python_graph(self._sess.graph)
node_name = "simple_mul_add/matmul"
out = self._registry.dispatch_command("node_info", ["-t", node_name])
assert_node_attribute_lines(
self,
out,
node_name,
"MatMul",
self._main_device, [("Identity", "simple_mul_add/u/read"),
("Identity", "simple_mul_add/v/read")], [],
[("Add", "simple_mul_add/add"), ("Add", "simple_mul_add/add")], [],
show_stack_trace=True, stack_trace_available=True)
check_main_menu(
self,
out,
list_tensors_enabled=True,
list_inputs_node_name=node_name,
print_tensor_node_name=node_name,
list_outputs_node_name=node_name)
def testNodeInfoByTensorName(self):
node_name = "simple_mul_add/u/read"
tensor_name = node_name + ":0"


@@ -352,7 +352,8 @@ def get_run_short_description(run_call_count, fetches, feed_dict):
else:
if len(feed_dict) == 1:
for key in feed_dict:
description += "1 feed (%s)" % key.name
description += "1 feed (%s)" % (
key if isinstance(key, six.string_types) else key.name)
else:
description += "%d feeds" % len(feed_dict)
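
This hunk is the bug fix described in the commit message: a plain str/unicode feed key has no `.name` attribute, so the old formatting raised an AttributeError. A minimal standalone sketch of the pattern (the helper and the fake Tensor class are illustrative, not library code):

```python
import six


class _FakeTensor(object):
  """Stands in for a tf.Tensor, which exposes its graph name via .name."""
  name = "a:0"


def describe_feed_key(key):
  # Tensor-like feed keys are described by their .name; plain str/unicode
  # feed keys (also accepted by Session.run) are used verbatim.
  return key if isinstance(key, six.string_types) else key.name


assert describe_feed_key(_FakeTensor()) == "a:0"
assert describe_feed_key(u"input_placeholder") == "input_placeholder"
```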
@@ -384,14 +385,19 @@ def get_error_intro(tf_error):
intro_lines, font_attr_segs=intro_font_attr_segs)
out.extend(
_recommend_command("ni %s" % op_name,
"Inspect information about the failing op."))
_recommend_command("ni -a -d -t %s" % op_name,
"Inspect information about the failing op.",
create_link=True))
out.extend(
_recommend_command("li -r %s" % op_name,
"List inputs to the failing op, recursively."))
"List inputs to the failing op, recursively.",
create_link=True))
out.extend(
_recommend_command(
"lt", "List all tensors dumped during the failing run() call."))
"lt",
"List all tensors dumped during the failing run() call.",
create_link=True))
more_lines = [
"",

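This hunk is the mouse-click shortcut mentioned in the commit message: besides bolding each recommended command, the error help text now attaches a menu item to the command's text segment so that frontends with mouse support can treat it as a clickable link. A rough sketch of the pattern, assuming the `MenuItem` and `RichTextLines` classes in `debugger_cli_common` (this is not the actual `_recommend_command` implementation):

```python
from tensorflow.python.debug.cli import debugger_cli_common


def recommend_command(command, description, create_link=False):
  """Render a recommended command, optionally as a clickable link."""
  if create_link:
    # A MenuItem in the font attribute segment makes the span clickable in
    # frontends that support mouse events; "bold" keeps it highlighted.
    attrs = [debugger_cli_common.MenuItem("", command), "bold"]
  else:
    attrs = ["bold"]
  lines = ["  " + command, "    " + description, ""]
  return debugger_cli_common.RichTextLines(
      lines, font_attr_segs={0: [(2, 2 + len(command), attrs)]})
```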

@@ -227,6 +227,16 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
command_set.add(annot[2].content)
self.assertEqual({"run -f filter_a", "run -f filter_b"}, command_set)
def testGetRunShortDescriptionWorksForTensorFeedKey(self):
short_description = cli_shared.get_run_short_description(
1, self.const_a, {self.const_a: 42.0})
self.assertEqual("run #1: 1 fetch (a:0); 1 feed (a:0)", short_description)
def testGetRunShortDescriptionWorksForUnicodeFeedKey(self):
short_description = cli_shared.get_run_short_description(
1, self.const_a, {u"foo": 42.0})
self.assertEqual("run #1: 1 fetch (a:0); 1 feed (foo)", short_description)
class GetErrorIntroTest(test_util.TensorFlowTestCase):
@@ -247,14 +257,25 @@ class GetErrorIntroTest(test_util.TensorFlowTestCase):
self.assertEqual([(0, len(error_intro.lines[1]), "blink")],
error_intro.font_attr_segs[1])
self.assertEqual(2, error_intro.lines[4].index("ni a/Assign"))
self.assertEqual([(2, 13, "bold")], error_intro.font_attr_segs[4])
self.assertEqual(2, error_intro.lines[4].index("ni -a -d -t a/Assign"))
self.assertEqual(2, error_intro.font_attr_segs[4][0][0])
self.assertEqual(22, error_intro.font_attr_segs[4][0][1])
self.assertEqual("ni -a -d -t a/Assign",
error_intro.font_attr_segs[4][0][2][0].content)
self.assertEqual("bold", error_intro.font_attr_segs[4][0][2][1])
self.assertEqual(2, error_intro.lines[6].index("li -r a/Assign"))
self.assertEqual([(2, 16, "bold")], error_intro.font_attr_segs[6])
self.assertEqual(2, error_intro.font_attr_segs[6][0][0])
self.assertEqual(16, error_intro.font_attr_segs[6][0][1])
self.assertEqual("li -r a/Assign",
error_intro.font_attr_segs[6][0][2][0].content)
self.assertEqual("bold", error_intro.font_attr_segs[6][0][2][1])
self.assertEqual(2, error_intro.lines[8].index("lt"))
self.assertEqual([(2, 4, "bold")], error_intro.font_attr_segs[8])
self.assertEqual(2, error_intro.font_attr_segs[8][0][0])
self.assertEqual(4, error_intro.font_attr_segs[8][0][1])
self.assertEqual("lt", error_intro.font_attr_segs[8][0][2][0].content)
self.assertEqual("bold", error_intro.font_attr_segs[8][0][2][1])
self.assertStartsWith(error_intro.lines[11], "Op name:")
self.assertTrue(error_intro.lines[11].endswith("a/Assign"))


@@ -378,6 +378,8 @@ class DebugDumpDir(object):
self._create_tensor_watch_maps()
self._load_partition_graphs(partition_graphs, validate)
self._python_graph = None
def _load_dumps(self, dump_root):
"""Load DebugTensorDatum instances from the dump root.
@@ -471,6 +473,23 @@ class DebugDumpDir(object):
self._watch_key_to_rel_time[datum.watch_key].append(datum.timestamp -
self._t0)
def set_python_graph(self, python_graph):
"""Provide Python `Graph` object to the wrapper.
Unlike the partition graphs, which are protobuf `GraphDef` objects, `Graph`
is a Python object and carries additional information such as the traceback
of nodes in the graph.
Args:
python_graph: (ops.Graph) The Python Graph object.
"""
self._python_graph = python_graph
self._node_traceback = {}
if self._python_graph:
for op in self._python_graph.get_operations():
self._node_traceback[op.name] = op.traceback
@property
def dumped_tensor_data(self):
return self._dump_tensor_data
@@ -1136,3 +1155,27 @@ class DebugDumpDir(object):
watch_key)
return self._watch_key_to_rel_time[watch_key]
def node_traceback(self, element_name):
"""Try to retrieve the Python traceback of node's construction.
Args:
element_name: (str) Name of a graph element (node or tensor).
Returns:
(list) The traceback list object, in the format returned by the
`extract_stack` function of Python's traceback module.
Raises:
LookupError: If Python graph is not available for traceback lookup.
KeyError: If the node cannot be found in the Python graph loaded.
"""
if self._python_graph is None:
raise LookupError("Python graph is not available for traceback lookup")
node_name = get_node_name(element_name)
if node_name not in self._node_traceback:
raise KeyError("Cannot find node \"%s\" in Python graph" % node_name)
return self._node_traceback[node_name]
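
`set_python_graph()` works because every `Operation` in a Python `Graph` records the stack frames of its construction, which `node_traceback()` then exposes by node or tensor name. A self-contained sketch of that underlying mechanism, using the TF 1.x-era API this commit targets:

```python
import tensorflow as tf

g = tf.Graph()
with g.as_default():
  a = tf.constant(1.0, name="demo/a")
  b = tf.constant(2.0, name="demo/b")
  c = tf.add(a, b, name="demo/c")

op = g.get_operation_by_name("demo/c")
# Each frame is a (file_path, line_number, function_name, text) tuple, the
# format rendered by `ni -t` in the analyzer CLI.
for file_path, line, function_name, text in op.traceback:
  print("%s:%d, in %s: %s" % (file_path, line, function_name, text))
```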


@@ -932,6 +932,54 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
self.assertTrue(np.isnan(numeric_summary[10]))
self.assertTrue(np.isnan(numeric_summary[11]))
def testLookUpNodePythonTracebackWorks(self):
with session.Session() as sess:
u_init = constant_op.constant(10.0)
u = variables.Variable(u_init, name="traceback/u")
v_init = constant_op.constant(20.0)
v = variables.Variable(v_init, name="traceback/v")
w = math_ops.multiply(u, v, name="traceback/w")
sess.run(variables.global_variables_initializer())
run_metadata = config_pb2.RunMetadata()
run_options = config_pb2.RunOptions(output_partition_graphs=True)
debug_utils.watch_graph(
run_options, sess.graph, debug_urls=self._debug_urls())
sess.run(w, options=run_options, run_metadata=run_metadata)
dump = debug_data.DebugDumpDir(
self._dump_root, partition_graphs=run_metadata.partition_graphs)
# Prior to setting the Python graph, attempts to do traceback lookup
# should lead to exceptions.
with self.assertRaisesRegexp(
LookupError, "Python graph is not available for traceback lookup"):
dump.node_traceback("traceback/w")
dump.set_python_graph(sess.graph)
# After setting the Python graph, attempts to look up nonexistent nodes
# should lead to exceptions.
with self.assertRaisesRegexp(
KeyError, r"Cannot find node \"foo\" in Python graph"):
dump.node_traceback("foo")
# Lookup should work with node name input.
traceback = dump.node_traceback("traceback/w")
self.assertIsInstance(traceback, list)
self.assertGreater(len(traceback), 0)
for trace in traceback:
self.assertIsInstance(trace, tuple)
# Lookup should also work with tensor name input.
traceback = dump.node_traceback("traceback/w:0")
self.assertIsInstance(traceback, list)
self.assertGreater(len(traceback), 0)
for trace in traceback:
self.assertIsInstance(trace, tuple)
if __name__ == "__main__":
googletest.main()


@@ -257,6 +257,7 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
debug_dump = debug_data.DebugDumpDir(
self._dump_root, partition_graphs=partition_graphs)
debug_dump.set_python_graph(self._sess.graph)
passed_filter = None
if self._active_tensor_filter: