From d2b92f24e072d0bb39584a9e6bee049edb49f905 Mon Sep 17 00:00:00 2001
From: Shanqing Cai <cais@google.com>
Date: Tue, 20 Dec 2016 13:52:54 -0800
Subject: [PATCH] tfdbg: fix a bug in graph-dump causality validation algorithm

The bug has to do with how node vs. tensor names are handled by the pending input tracker. The issue was as follows: suppose a node called A produces two outputs on two slots, A:0 and A:1. B receives inputs from A:0 and C receives input from A:1.

Suppose only the two following tensors are watched by the debugger: {A:0, C:0}, then there is no guarantee that the debug tensor for A:0 is dumped before the debug tensor of C:0, because C doesn't receive input from A:0 (it receives input from A:1!). As a result, during causality validation, the debugger may sometimes see that C:0 is dumped before A:0, thinking its a causality violation.

This CL fixes the bug by tracking the inputs down to the level of output slots, instead of just at the level of nodes.

Hitchhiking changes:
* Minor change in title style of the offline debugger.
Change: 142593110
---
 .../python/debug/cli/offline_analyzer.py      |  8 +++-
 tensorflow/python/debug/debug_data.py         | 30 +++++++++++---
 .../python/debug/session_debug_testlib.py     | 39 +++++++++++++++++++
 3 files changed, 70 insertions(+), 7 deletions(-)

diff --git a/tensorflow/python/debug/cli/offline_analyzer.py b/tensorflow/python/debug/cli/offline_analyzer.py
index 38e6a63d7d9..fadb59ea79f 100644
--- a/tensorflow/python/debug/cli/offline_analyzer.py
+++ b/tensorflow/python/debug/cli/offline_analyzer.py
@@ -29,6 +29,9 @@ FLAGS = flags.FLAGS
 flags.DEFINE_string("dump_dir", "", "tfdbg dump directory to load")
 flags.DEFINE_boolean(
     "log_usage", True, "Whether the usage of this tool is to be logged")
+flags.DEFINE_boolean(
+    "validate_graph", True,
+    "Whether the dumped tensors will be validated against the GraphDefs")
 
 
 def main(_):
@@ -41,13 +44,14 @@ def main(_):
 
   print("tfdbg offline: FLAGS.dump_dir = %s" % FLAGS.dump_dir)
 
-  debug_dump = debug_data.DebugDumpDir(FLAGS.dump_dir)
+  debug_dump = debug_data.DebugDumpDir(
+      FLAGS.dump_dir, validate=FLAGS.validate_graph)
   cli = analyzer_cli.create_analyzer_curses_cli(
       debug_dump,
       tensor_filters={"has_inf_or_nan": debug_data.has_inf_or_nan})
 
   title = "tfdbg offline @ %s" % FLAGS.dump_dir
-  cli.run_ui(title=title, init_command="lt")
+  cli.run_ui(title=title, title_color="black_on_white", init_command="lt")
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/debug/debug_data.py b/tensorflow/python/debug/debug_data.py
index e4097d5a0e2..4c61eaeca72 100644
--- a/tensorflow/python/debug/debug_data.py
+++ b/tensorflow/python/debug/debug_data.py
@@ -98,6 +98,21 @@ def _get_node_name(element_name):
   return element_name.split(":")[0] if ":" in element_name else element_name
 
 
+def _get_output_slot(element_name):
+  """Get the output slot number from the name of a graph element.
+
+  If element_name is a node name without output slot at the end, 0 will be
+  assumed.
+
+  Args:
+    element_name: (str) name of the graph element in question.
+
+  Returns:
+    (int) output slot number.
+  """
+  return int(element_name.split(":")[-1]) if ":" in element_name else 0
+
+
 def _get_tensor_name(node_name, output_slot):
   """Get tensor name given node name and output slot index.
 
@@ -684,12 +699,16 @@ class DebugDumpDir(object):
       pending_inputs[node] = []
       inputs = self._node_inputs[node]
       for inp in inputs:
-        inp = _get_node_name(inp)
-        if inp in self._debug_watches:
-          pending_inputs[node].append(inp)
+        inp_node = _get_node_name(inp)
+        inp_output_slot = _get_output_slot(inp)
+        if (inp_node in self._debug_watches and
+            inp_output_slot in self._debug_watches[inp_node] and
+            (inp_node, inp_output_slot) not in pending_inputs[node]):
+          pending_inputs[node].append((inp_node, inp_output_slot))
 
     for datum in self._dump_tensor_data:
       node = datum.node_name
+      slot = datum.output_slot
       if pending_inputs[node]:
         raise ValueError("Causality violated in timing relations of debug "
                          "dumps: %s (%d): "
@@ -699,13 +718,14 @@ class DebugDumpDir(object):
       recipients = self._node_recipients[node]
       for recipient in recipients:
         recipient_pending_inputs = pending_inputs[recipient]
-        if node in recipient_pending_inputs:
+        if (node, slot) in recipient_pending_inputs:
           if self.node_op_type(recipient) == "Merge":
             # If this is a Merge op, we automatically clear the list because
             # a Merge node only requires one of its two inputs.
             del recipient_pending_inputs[:]
           else:
-            del recipient_pending_inputs[recipient_pending_inputs.index(node)]
+            del recipient_pending_inputs[
+                recipient_pending_inputs.index((node, slot))]
 
   def loaded_partition_graphs(self):
     """Test whether partition graphs have been loaded."""
diff --git a/tensorflow/python/debug/session_debug_testlib.py b/tensorflow/python/debug/session_debug_testlib.py
index 7e5fffbc3a1..b2ff9239e54 100644
--- a/tensorflow/python/debug/session_debug_testlib.py
+++ b/tensorflow/python/debug/session_debug_testlib.py
@@ -34,6 +34,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
@@ -637,6 +638,44 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
           partition_graphs=run_metadata.partition_graphs,
           validate=False)
 
+  def testWatchingOnlyOneOfTwoOutputSlotsDoesNotLeadToCausalityFailure(self):
+    with session.Session() as sess:
+      x_name = "oneOfTwoSlots/x"
+      u_name = "oneOfTwoSlots/u"
+      v_name = "oneOfTwoSlots/v"
+      w_name = "oneOfTwoSlots/w"
+      y_name = "oneOfTwoSlots/y"
+
+      x = variables.Variable([1, 3, 3, 7], dtype=tf.int32, name=x_name)
+      sess.run(x.initializer)
+
+      unique_x, indices, _ = array_ops.unique_with_counts(x, name=u_name)
+
+      v = math_ops.add(unique_x, unique_x, name=v_name)
+      w = math_ops.add(indices, indices, name=w_name)
+      y = math_ops.add(w, w, name=y_name)
+
+      run_options = config_pb2.RunOptions(output_partition_graphs=True)
+      # Watch only the first output slot of u, even though it has two output
+      # slots.
+      debug_utils.add_debug_tensor_watch(
+          run_options, u_name, 0, debug_urls=self._debug_urls())
+      debug_utils.add_debug_tensor_watch(
+          run_options, w_name, 0, debug_urls=self._debug_urls())
+      debug_utils.add_debug_tensor_watch(
+          run_options, y_name, 0, debug_urls=self._debug_urls())
+
+      run_metadata = config_pb2.RunMetadata()
+      sess.run([v, y], options=run_options, run_metadata=run_metadata)
+
+      dump = debug_data.DebugDumpDir(
+          self._dump_root,
+          partition_graphs=run_metadata.partition_graphs,
+          validate=True)
+
+      self.assertAllClose([1, 3, 7],
+                          dump.get_tensors(u_name, 0, "DebugIdentity")[0])
+
   def testOutputSlotWithoutOutgoingEdgeCanBeWatched(self):
     """Test watching output slots not attached to any outgoing edges."""