tfdbg: fix a bug in graph-dump causality validation algorithm
The bug has to do with how node vs. tensor names are handled by the pending input tracker. The issue was as follows: suppose a node called A produces two outputs on two slots, A:0 and A:1. B receives inputs from A:0 and C receives input from A:1. Suppose only the two following tensors are watched by the debugger: {A:0, C:0}, then there is no guarantee that the debug tensor for A:0 is dumped before the debug tensor of C:0, because C doesn't receive input from A:0 (it receives input from A:1!). As a result, during causality validation, the debugger may sometimes see that C:0 is dumped before A:0, thinking it's a causality violation. This CL fixes the bug by tracking the inputs down to the level of output slots, instead of just at the level of nodes. Hitchhiking changes: * Minor change in title style of the offline debugger. Change: 142593110
This commit is contained in:
parent
a9e1619889
commit
d2b92f24e0
@ -29,6 +29,9 @@ FLAGS = flags.FLAGS
|
|||||||
flags.DEFINE_string("dump_dir", "", "tfdbg dump directory to load")
|
flags.DEFINE_string("dump_dir", "", "tfdbg dump directory to load")
|
||||||
flags.DEFINE_boolean(
|
flags.DEFINE_boolean(
|
||||||
"log_usage", True, "Whether the usage of this tool is to be logged")
|
"log_usage", True, "Whether the usage of this tool is to be logged")
|
||||||
|
flags.DEFINE_boolean(
|
||||||
|
"validate_graph", True,
|
||||||
|
"Whether the dumped tensors will be validated against the GraphDefs")
|
||||||
|
|
||||||
|
|
||||||
def main(_):
|
def main(_):
|
||||||
@ -41,13 +44,14 @@ def main(_):
|
|||||||
|
|
||||||
print("tfdbg offline: FLAGS.dump_dir = %s" % FLAGS.dump_dir)
|
print("tfdbg offline: FLAGS.dump_dir = %s" % FLAGS.dump_dir)
|
||||||
|
|
||||||
debug_dump = debug_data.DebugDumpDir(FLAGS.dump_dir)
|
debug_dump = debug_data.DebugDumpDir(
|
||||||
|
FLAGS.dump_dir, validate=FLAGS.validate_graph)
|
||||||
cli = analyzer_cli.create_analyzer_curses_cli(
|
cli = analyzer_cli.create_analyzer_curses_cli(
|
||||||
debug_dump,
|
debug_dump,
|
||||||
tensor_filters={"has_inf_or_nan": debug_data.has_inf_or_nan})
|
tensor_filters={"has_inf_or_nan": debug_data.has_inf_or_nan})
|
||||||
|
|
||||||
title = "tfdbg offline @ %s" % FLAGS.dump_dir
|
title = "tfdbg offline @ %s" % FLAGS.dump_dir
|
||||||
cli.run_ui(title=title, init_command="lt")
|
cli.run_ui(title=title, title_color="black_on_white", init_command="lt")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@ -98,6 +98,21 @@ def _get_node_name(element_name):
|
|||||||
return element_name.split(":")[0] if ":" in element_name else element_name
|
return element_name.split(":")[0] if ":" in element_name else element_name
|
||||||
|
|
||||||
|
|
||||||
|
def _get_output_slot(element_name):
|
||||||
|
"""Get the output slot number from the name of a graph element.
|
||||||
|
|
||||||
|
If element_name is a node name without output slot at the end, 0 will be
|
||||||
|
assumed.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
element_name: (str) name of the graph element in question.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
(int) output slot number.
|
||||||
|
"""
|
||||||
|
return int(element_name.split(":")[-1]) if ":" in element_name else 0
|
||||||
|
|
||||||
|
|
||||||
def _get_tensor_name(node_name, output_slot):
|
def _get_tensor_name(node_name, output_slot):
|
||||||
"""Get tensor name given node name and output slot index.
|
"""Get tensor name given node name and output slot index.
|
||||||
|
|
||||||
@ -684,12 +699,16 @@ class DebugDumpDir(object):
|
|||||||
pending_inputs[node] = []
|
pending_inputs[node] = []
|
||||||
inputs = self._node_inputs[node]
|
inputs = self._node_inputs[node]
|
||||||
for inp in inputs:
|
for inp in inputs:
|
||||||
inp = _get_node_name(inp)
|
inp_node = _get_node_name(inp)
|
||||||
if inp in self._debug_watches:
|
inp_output_slot = _get_output_slot(inp)
|
||||||
pending_inputs[node].append(inp)
|
if (inp_node in self._debug_watches and
|
||||||
|
inp_output_slot in self._debug_watches[inp_node] and
|
||||||
|
(inp_node, inp_output_slot) not in pending_inputs[node]):
|
||||||
|
pending_inputs[node].append((inp_node, inp_output_slot))
|
||||||
|
|
||||||
for datum in self._dump_tensor_data:
|
for datum in self._dump_tensor_data:
|
||||||
node = datum.node_name
|
node = datum.node_name
|
||||||
|
slot = datum.output_slot
|
||||||
if pending_inputs[node]:
|
if pending_inputs[node]:
|
||||||
raise ValueError("Causality violated in timing relations of debug "
|
raise ValueError("Causality violated in timing relations of debug "
|
||||||
"dumps: %s (%d): "
|
"dumps: %s (%d): "
|
||||||
@ -699,13 +718,14 @@ class DebugDumpDir(object):
|
|||||||
recipients = self._node_recipients[node]
|
recipients = self._node_recipients[node]
|
||||||
for recipient in recipients:
|
for recipient in recipients:
|
||||||
recipient_pending_inputs = pending_inputs[recipient]
|
recipient_pending_inputs = pending_inputs[recipient]
|
||||||
if node in recipient_pending_inputs:
|
if (node, slot) in recipient_pending_inputs:
|
||||||
if self.node_op_type(recipient) == "Merge":
|
if self.node_op_type(recipient) == "Merge":
|
||||||
# If this is a Merge op, we automatically clear the list because
|
# If this is a Merge op, we automatically clear the list because
|
||||||
# a Merge node only requires one of its two inputs.
|
# a Merge node only requires one of its two inputs.
|
||||||
del recipient_pending_inputs[:]
|
del recipient_pending_inputs[:]
|
||||||
else:
|
else:
|
||||||
del recipient_pending_inputs[recipient_pending_inputs.index(node)]
|
del recipient_pending_inputs[
|
||||||
|
recipient_pending_inputs.index((node, slot))]
|
||||||
|
|
||||||
def loaded_partition_graphs(self):
|
def loaded_partition_graphs(self):
|
||||||
"""Test whether partition graphs have been loaded."""
|
"""Test whether partition graphs have been loaded."""
|
||||||
|
@ -34,6 +34,7 @@ from tensorflow.python.framework import dtypes
|
|||||||
from tensorflow.python.framework import errors
|
from tensorflow.python.framework import errors
|
||||||
from tensorflow.python.framework import ops
|
from tensorflow.python.framework import ops
|
||||||
from tensorflow.python.framework import test_util
|
from tensorflow.python.framework import test_util
|
||||||
|
from tensorflow.python.ops import array_ops
|
||||||
from tensorflow.python.ops import control_flow_ops
|
from tensorflow.python.ops import control_flow_ops
|
||||||
from tensorflow.python.ops import math_ops
|
from tensorflow.python.ops import math_ops
|
||||||
from tensorflow.python.ops import state_ops
|
from tensorflow.python.ops import state_ops
|
||||||
@ -637,6 +638,44 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
|
|||||||
partition_graphs=run_metadata.partition_graphs,
|
partition_graphs=run_metadata.partition_graphs,
|
||||||
validate=False)
|
validate=False)
|
||||||
|
|
||||||
|
def testWatchingOnlyOneOfTwoOutputSlotsDoesNotLeadToCausalityFailure(self):
|
||||||
|
with session.Session() as sess:
|
||||||
|
x_name = "oneOfTwoSlots/x"
|
||||||
|
u_name = "oneOfTwoSlots/u"
|
||||||
|
v_name = "oneOfTwoSlots/v"
|
||||||
|
w_name = "oneOfTwoSlots/w"
|
||||||
|
y_name = "oneOfTwoSlots/y"
|
||||||
|
|
||||||
|
x = variables.Variable([1, 3, 3, 7], dtype=tf.int32, name=x_name)
|
||||||
|
sess.run(x.initializer)
|
||||||
|
|
||||||
|
unique_x, indices, _ = array_ops.unique_with_counts(x, name=u_name)
|
||||||
|
|
||||||
|
v = math_ops.add(unique_x, unique_x, name=v_name)
|
||||||
|
w = math_ops.add(indices, indices, name=w_name)
|
||||||
|
y = math_ops.add(w, w, name=y_name)
|
||||||
|
|
||||||
|
run_options = config_pb2.RunOptions(output_partition_graphs=True)
|
||||||
|
# Watch only the first output slot of u, even though it has two output
|
||||||
|
# slots.
|
||||||
|
debug_utils.add_debug_tensor_watch(
|
||||||
|
run_options, u_name, 0, debug_urls=self._debug_urls())
|
||||||
|
debug_utils.add_debug_tensor_watch(
|
||||||
|
run_options, w_name, 0, debug_urls=self._debug_urls())
|
||||||
|
debug_utils.add_debug_tensor_watch(
|
||||||
|
run_options, y_name, 0, debug_urls=self._debug_urls())
|
||||||
|
|
||||||
|
run_metadata = config_pb2.RunMetadata()
|
||||||
|
sess.run([v, y], options=run_options, run_metadata=run_metadata)
|
||||||
|
|
||||||
|
dump = debug_data.DebugDumpDir(
|
||||||
|
self._dump_root,
|
||||||
|
partition_graphs=run_metadata.partition_graphs,
|
||||||
|
validate=True)
|
||||||
|
|
||||||
|
self.assertAllClose([1, 3, 7],
|
||||||
|
dump.get_tensors(u_name, 0, "DebugIdentity")[0])
|
||||||
|
|
||||||
def testOutputSlotWithoutOutgoingEdgeCanBeWatched(self):
|
def testOutputSlotWithoutOutgoingEdgeCanBeWatched(self):
|
||||||
"""Test watching output slots not attached to any outgoing edges."""
|
"""Test watching output slots not attached to any outgoing edges."""
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user