From d2b92f24e072d0bb39584a9e6bee049edb49f905 Mon Sep 17 00:00:00 2001 From: Shanqing Cai Date: Tue, 20 Dec 2016 13:52:54 -0800 Subject: [PATCH] tfdbg: fix a bug in graph-dump causality validation algorithm The bug has to do with how node vs. tensor names are handled by the pending input tracker. The issue was as follows: suppose a node called A produces two outputs on two slots, A:0 and A:1. B receives inputs from A:0 and C receives input from A:1. Suppose only the two following tensors are watched by the debugger: {A:0, C:0}, then there is no guarantee that the debug tensor for A:0 is dumped before the debug tensor of C:0, because C doesn't receive input from A:0 (it receives input from A:1!). As a result, during causality validation, the debugger may sometimes see that C:0 is dumped before A:0, thinking its a causality violation. This CL fixes the bug by tracking the inputs down to the level of output slots, instead of just at the level of nodes. Hitchhiking changes: * Minor change in title style of the offline debugger. Change: 142593110 --- .../python/debug/cli/offline_analyzer.py | 8 +++- tensorflow/python/debug/debug_data.py | 30 +++++++++++--- .../python/debug/session_debug_testlib.py | 39 +++++++++++++++++++ 3 files changed, 70 insertions(+), 7 deletions(-) diff --git a/tensorflow/python/debug/cli/offline_analyzer.py b/tensorflow/python/debug/cli/offline_analyzer.py index 38e6a63d7d9..fadb59ea79f 100644 --- a/tensorflow/python/debug/cli/offline_analyzer.py +++ b/tensorflow/python/debug/cli/offline_analyzer.py @@ -29,6 +29,9 @@ FLAGS = flags.FLAGS flags.DEFINE_string("dump_dir", "", "tfdbg dump directory to load") flags.DEFINE_boolean( "log_usage", True, "Whether the usage of this tool is to be logged") +flags.DEFINE_boolean( + "validate_graph", True, + "Whether the dumped tensors will be validated against the GraphDefs") def main(_): @@ -41,13 +44,14 @@ def main(_): print("tfdbg offline: FLAGS.dump_dir = %s" % FLAGS.dump_dir) - debug_dump = debug_data.DebugDumpDir(FLAGS.dump_dir) + debug_dump = debug_data.DebugDumpDir( + FLAGS.dump_dir, validate=FLAGS.validate_graph) cli = analyzer_cli.create_analyzer_curses_cli( debug_dump, tensor_filters={"has_inf_or_nan": debug_data.has_inf_or_nan}) title = "tfdbg offline @ %s" % FLAGS.dump_dir - cli.run_ui(title=title, init_command="lt") + cli.run_ui(title=title, title_color="black_on_white", init_command="lt") if __name__ == "__main__": diff --git a/tensorflow/python/debug/debug_data.py b/tensorflow/python/debug/debug_data.py index e4097d5a0e2..4c61eaeca72 100644 --- a/tensorflow/python/debug/debug_data.py +++ b/tensorflow/python/debug/debug_data.py @@ -98,6 +98,21 @@ def _get_node_name(element_name): return element_name.split(":")[0] if ":" in element_name else element_name +def _get_output_slot(element_name): + """Get the output slot number from the name of a graph element. + + If element_name is a node name without output slot at the end, 0 will be + assumed. + + Args: + element_name: (str) name of the graph element in question. + + Returns: + (int) output slot number. + """ + return int(element_name.split(":")[-1]) if ":" in element_name else 0 + + def _get_tensor_name(node_name, output_slot): """Get tensor name given node name and output slot index. @@ -684,12 +699,16 @@ class DebugDumpDir(object): pending_inputs[node] = [] inputs = self._node_inputs[node] for inp in inputs: - inp = _get_node_name(inp) - if inp in self._debug_watches: - pending_inputs[node].append(inp) + inp_node = _get_node_name(inp) + inp_output_slot = _get_output_slot(inp) + if (inp_node in self._debug_watches and + inp_output_slot in self._debug_watches[inp_node] and + (inp_node, inp_output_slot) not in pending_inputs[node]): + pending_inputs[node].append((inp_node, inp_output_slot)) for datum in self._dump_tensor_data: node = datum.node_name + slot = datum.output_slot if pending_inputs[node]: raise ValueError("Causality violated in timing relations of debug " "dumps: %s (%d): " @@ -699,13 +718,14 @@ class DebugDumpDir(object): recipients = self._node_recipients[node] for recipient in recipients: recipient_pending_inputs = pending_inputs[recipient] - if node in recipient_pending_inputs: + if (node, slot) in recipient_pending_inputs: if self.node_op_type(recipient) == "Merge": # If this is a Merge op, we automatically clear the list because # a Merge node only requires one of its two inputs. del recipient_pending_inputs[:] else: - del recipient_pending_inputs[recipient_pending_inputs.index(node)] + del recipient_pending_inputs[ + recipient_pending_inputs.index((node, slot))] def loaded_partition_graphs(self): """Test whether partition graphs have been loaded.""" diff --git a/tensorflow/python/debug/session_debug_testlib.py b/tensorflow/python/debug/session_debug_testlib.py index 7e5fffbc3a1..b2ff9239e54 100644 --- a/tensorflow/python/debug/session_debug_testlib.py +++ b/tensorflow/python/debug/session_debug_testlib.py @@ -34,6 +34,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import test_util +from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import state_ops @@ -637,6 +638,44 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase): partition_graphs=run_metadata.partition_graphs, validate=False) + def testWatchingOnlyOneOfTwoOutputSlotsDoesNotLeadToCausalityFailure(self): + with session.Session() as sess: + x_name = "oneOfTwoSlots/x" + u_name = "oneOfTwoSlots/u" + v_name = "oneOfTwoSlots/v" + w_name = "oneOfTwoSlots/w" + y_name = "oneOfTwoSlots/y" + + x = variables.Variable([1, 3, 3, 7], dtype=tf.int32, name=x_name) + sess.run(x.initializer) + + unique_x, indices, _ = array_ops.unique_with_counts(x, name=u_name) + + v = math_ops.add(unique_x, unique_x, name=v_name) + w = math_ops.add(indices, indices, name=w_name) + y = math_ops.add(w, w, name=y_name) + + run_options = config_pb2.RunOptions(output_partition_graphs=True) + # Watch only the first output slot of u, even though it has two output + # slots. + debug_utils.add_debug_tensor_watch( + run_options, u_name, 0, debug_urls=self._debug_urls()) + debug_utils.add_debug_tensor_watch( + run_options, w_name, 0, debug_urls=self._debug_urls()) + debug_utils.add_debug_tensor_watch( + run_options, y_name, 0, debug_urls=self._debug_urls()) + + run_metadata = config_pb2.RunMetadata() + sess.run([v, y], options=run_options, run_metadata=run_metadata) + + dump = debug_data.DebugDumpDir( + self._dump_root, + partition_graphs=run_metadata.partition_graphs, + validate=True) + + self.assertAllClose([1, 3, 7], + dump.get_tensors(u_name, 0, "DebugIdentity")[0]) + def testOutputSlotWithoutOutgoingEdgeCanBeWatched(self): """Test watching output slots not attached to any outgoing edges."""