[tfdbg2] Various improvements to DebugDataReader for DebuggerV2

This is related to https://github.com/tensorflow/tensorboard/pull/3564

1. Add DebuggedGraph.get_op_creation_digest()
2. Remove DebuggedGraph.get_op_type(), which is superseded by
   DebuggedGraph.get_op_creation_digest() and is not used anywhere.
3. Add DebuggedGraph.add_op_consumers() and DebuggedGraph.get_op_consumers()
   to enable efficient tracking of the downstream consuming ops of a graph
   op.
4. Add host_name and stack_frame_ids to data class GraphOpCreationDigest.

PiperOrigin-RevId: 309455936
Change-Id: I104084c1ef8b887f69733702a2f4c3190fa5402f
This commit is contained in:
Shanqing Cai 2020-05-01 12:45:24 -07:00 committed by TensorFlower Gardener
parent 6232075ca8
commit 958fbebe70
3 changed files with 144 additions and 15 deletions

View File

@ -410,6 +410,8 @@ class DebuggedGraph(object):
self._inner_graph_ids = []
# A dictionary from op name to GraphOpCreationDigest.
self._op_by_name = dict()
# A dictionary mapping op to immediate downstream consumers.
self._op_consumers = collections.defaultdict(list)
def add_inner_graph_id(self, inner_graph_id):
"""Add the debugger-generated ID of a graph nested within this graph.
@ -434,6 +436,18 @@ class DebuggedGraph(object):
self._op_by_name[
graph_op_creation_digest.op_name] = graph_op_creation_digest
def add_op_consumer(self, src_op_name, src_slot, dst_op_name, dst_slot):
  """Record that an output tensor of one op is consumed by another op.

  Args:
    src_op_name: Name of the op of which the output tensor is being consumed.
    src_slot: 0-based output slot of the op being consumed.
    dst_op_name: Name of the consuming op (e.g., "Conv2D_3/BiasAdd")
    dst_slot: 0-based input slot of the consuming op that receives the tensor
      from this op.
  """
  edge = (src_slot, dst_op_name, dst_slot)
  self._op_consumers[src_op_name].append(edge)
@property
def name(self):
  """Name of the graph."""
  return self._name
@ -450,13 +464,33 @@ class DebuggedGraph(object):
def inner_graph_ids(self):
return self._inner_graph_ids
def get_op_type(self, op_name):
  """Look up the op type of the op with the given name."""
  creation_digest = self._op_by_name[op_name]
  return creation_digest.op_type
def get_tensor_id(self, op_name, output_slot):
  """Get the debugger-generated ID of a symbolic tensor in this graph."""
  creation_digest = self._op_by_name[op_name]
  return creation_digest.output_tensor_ids[output_slot]
def get_op_creation_digest(self, op_name):
  """Get the GraphOpCreationDigest for an op in the graph.

  Args:
    op_name: Name of the op to look up.

  Returns:
    The GraphOpCreationDigest registered for the op via add_op().

  Raises:
    KeyError: If no op with `op_name` exists in this graph.
  """
  return self._op_by_name[op_name]
def get_op_consumers(self, src_op_name):
  """Get all the downstream consumers of this op.

  Only data (non-control) edges are tracked.

  Args:
    src_op_name: Name of the op providing the tensor being consumed.

  Returns:
    A list of (src_slot, dst_op_name, dst_slot) tuples. In each item of
    the list:
      src_slot: 0-based output slot of the op of which the output tensor
        is being consumed.
      dst_op_name: Name of the consuming op (e.g., "Conv2D_3/BiasAdd")
      dst_slot: 0-based input slot of the consuming op that receives
        the tensor from this op.
  """
  # Use .get() rather than indexing: indexing a defaultdict inserts an
  # empty list for unseen keys, so read-only queries would silently grow
  # self._op_consumers.
  return self._op_consumers.get(src_op_name, [])
# TODO(cais): Implement to_json().
@ -500,6 +534,9 @@ class GraphOpCreationDigest(BaseDigest):
output_tensor_ids: Debugger-generated IDs for the output(s) of the op.
input_names: Names of the input tensors to the op.
device_name: The name of the device that the op is placed on (if available).
host_name: Name of the host on which the op is created.
stack_frame_ids: IDs of the frames of the stack trace at which the op
is created.
"""
def __init__(self,
@ -509,6 +546,8 @@ class GraphOpCreationDigest(BaseDigest):
op_type,
op_name,
output_tensor_ids,
host_name,
stack_frame_ids,
input_names=None,
device_name=None):
super(GraphOpCreationDigest, self).__init__(wall_time, offset)
@ -516,6 +555,8 @@ class GraphOpCreationDigest(BaseDigest):
self._op_type = op_type
self._op_name = op_name
self._output_tensor_ids = _tuple_or_none(output_tensor_ids)
self._host_name = host_name
self._stack_frame_ids = stack_frame_ids
self._input_names = _tuple_or_none(input_names)
self._device_name = device_name
@ -547,6 +588,14 @@ class GraphOpCreationDigest(BaseDigest):
def device_name(self):
return self._device_name
@property
def host_name(self):
  """Name of the host on which the op is created."""
  return self._host_name
@property
def stack_frame_ids(self):
  """IDs of the frames of the stack trace at which the op is created."""
  return self._stack_frame_ids
def to_json(self):
output = super(GraphOpCreationDigest, self).to_json()
output.update({
@ -554,6 +603,8 @@ class GraphOpCreationDigest(BaseDigest):
"op_type": self.op_type,
"op_name": self.op_name,
"output_tensor_ids": self.output_tensor_ids,
"host_name": self.host_name,
"stack_frame_ids": self.stack_frame_ids,
"input_names": self.input_names,
"device_name": self.device_name,
})
@ -849,9 +900,17 @@ class DebugDataReader(object):
op_creation_proto.op_type,
op_creation_proto.op_name,
tuple(op_creation_proto.output_tensor_ids),
op_creation_proto.code_location.host_name,
tuple(op_creation_proto.code_location.stack_frame_ids),
input_names=tuple(op_creation_proto.input_names))
self._graph_op_digests.append(op_digest)
self._graph_by_id[op_creation_proto.graph_id].add_op(op_digest)
debugged_graph = self._graph_by_id[op_creation_proto.graph_id]
debugged_graph.add_op(op_digest)
for dst_slot, input_name in enumerate(op_creation_proto.input_names):
src_op_name, src_slot = input_name.split(":")
debugged_graph.add_op_consumer(src_op_name, int(src_slot),
op_creation_proto.op_name, dst_slot)
elif debug_event.debugged_graph.ByteSize():
graph_proto = debug_event.debugged_graph
graph = DebuggedGraph(
@ -936,7 +995,7 @@ class DebugDataReader(object):
Returns:
Op type as a str.
"""
return self._graph_by_id[graph_id].get_op_type(op_name)
return self._graph_by_id[graph_id].get_op_creation_digest(op_name).op_type
def _load_execution(self):
"""Incrementally read the .execution file."""
@ -1136,13 +1195,10 @@ class DebugDataReader(object):
1. The host name.
2. The stack trace, as a list of (file_path, lineno, func) tuples.
"""
debug_event = self._reader.read_graphs_event(
graph_op_creation_digest.offset)
graph_op_creation = debug_event.graph_op_creation
host_name = graph_op_creation.code_location.host_name
return host_name, [
return graph_op_creation_digest.host_name, [
self._stack_frame_by_id[frame_id][1:]
for frame_id in graph_op_creation.code_location.stack_frame_ids]
for frame_id in graph_op_creation_digest.stack_frame_ids
]
# TODO(cais): Add graph_execution_digests() with an ExecutionDigest
# as a kwarg, to establish the association between top-level and intra-graph

View File

@ -662,8 +662,14 @@ class DataObjectsTest(test_util.TensorFlowTestCase):
def testGraphOpCreationDigestNoInputNoDeviceNameToJson(self):
op_creation_digest = debug_events_reader.GraphOpCreationDigest(
1234, 5678, "deadbeef", "FooOp", "Model_1/Foo_2",
[135], input_names=None, device_name=None)
1234,
5678,
"deadbeef",
"FooOp",
"Model_1/Foo_2", [135],
"machine.cluster", ("a1", "a2"),
input_names=None,
device_name=None)
json = op_creation_digest.to_json()
self.jsonRoundTripCheck(json)
self.assertEqual(json["wall_time"], 1234)
@ -671,13 +677,21 @@ class DataObjectsTest(test_util.TensorFlowTestCase):
self.assertEqual(json["op_type"], "FooOp")
self.assertEqual(json["op_name"], "Model_1/Foo_2")
self.assertEqual(json["output_tensor_ids"], (135,))
self.assertEqual(json["host_name"], "machine.cluster")
self.assertEqual(json["stack_frame_ids"], ("a1", "a2"))
self.assertIsNone(json["input_names"])
self.assertIsNone(json["device_name"])
def testGraphOpCreationDigestWithInputsAndDeviceNameToJson(self):
op_creation_digest = debug_events_reader.GraphOpCreationDigest(
1234, 5678, "deadbeef", "FooOp", "Model_1/Foo_2",
[135], input_names=["Bar_1", "Qux_2"], device_name="/device:GPU:0")
1234,
5678,
"deadbeef",
"FooOp",
"Model_1/Foo_2", [135],
"machine.cluster", ("a1", "a2"),
input_names=["Bar_1", "Qux_2"],
device_name="/device:GPU:0")
json = op_creation_digest.to_json()
self.jsonRoundTripCheck(json)
self.assertEqual(json["wall_time"], 1234)
@ -685,6 +699,8 @@ class DataObjectsTest(test_util.TensorFlowTestCase):
self.assertEqual(json["op_type"], "FooOp")
self.assertEqual(json["op_name"], "Model_1/Foo_2")
self.assertEqual(json["output_tensor_ids"], (135,))
self.assertEqual(json["host_name"], "machine.cluster")
self.assertEqual(json["stack_frame_ids"], ("a1", "a2"))
self.assertEqual(json["input_names"], ("Bar_1", "Qux_2"))
self.assertEqual(json["device_name"], "/device:GPU:0")

View File

@ -756,6 +756,63 @@ class DumpingCallbackTest(
non_placeholder_full_tensor_values[3],
np.sin(np.log(5.0) + 1.0)) # Sin op.
@parameterized.named_parameters(
    ("NoTensor", "NO_TENSOR"),
    ("FullTensor", "FULL_TENSOR"),
)
@test_util.run_in_graph_and_eager_modes
def testGraphOpConsumingRelationIsCaptured(self, tensor_debug_mode):
  """Op-to-consumer (data-edge) relations are captured by the reader.

  Runs a pair of nested tf.functions under the dumping callback, then
  verifies DebuggedGraph.get_op_consumers() for each traced op.
  """
  writer = dumping_callback.enable_dump_debug_info(
      self.dump_root, tensor_debug_mode=tensor_debug_mode)

  @def_function.function
  def log_sum(x, y):
    return math_ops.log(x + y)

  @def_function.function
  def maxindex_sin1p_log_sum(x, y):
    # unique() returns two outputs; only the 2nd (indices) is consumed.
    _, indices = array_ops.unique(math_ops.sin(1.0 + log_sum(x, y)))
    return math_ops.reduce_max(indices)

  x = constant_op.constant([2.0, 2.0])
  y = constant_op.constant([3.0, 3.0])
  maxindex = maxindex_sin1p_log_sum(x, y)
  self.assertAllEqual(maxindex, 0)
  writer.FlushNonExecutionFiles()
  writer.FlushExecutionFiles()

  with debug_events_reader.DebugDataReader(self.dump_root) as reader:
    reader.update()
    # Group the graph-execution traces by op type.
    traces = reader.graph_execution_traces()
    add_traces = [trace for trace in traces if trace.op_type == "AddV2"]
    log_traces = [trace for trace in traces if trace.op_type == "Log"]
    sin_traces = [trace for trace in traces if trace.op_type == "Sin"]
    unique_traces = [trace for trace in traces if trace.op_type == "Unique"]
    max_traces = [trace for trace in traces if trace.op_type == "Max"]
    self.assertLen(add_traces, 2)
    self.assertLen(log_traces, 1)
    self.assertLen(sin_traces, 1)
    self.assertLen(unique_traces, 2)  # The Unique op outputs two tensors.
    self.assertLen(max_traces, 1)
    graph = reader.graph_by_id(add_traces[0].graph_id)
    # The first AddV2 op is consumed by the Log op.
    self.assertEqual(
        graph.get_op_consumers(add_traces[0].op_name),
        [(0, log_traces[0].op_name, 0)])
    graph = reader.graph_by_id(add_traces[1].graph_id)
    # The second AddV2 op is consumed by the Sin op.
    self.assertEqual(
        graph.get_op_consumers(add_traces[1].op_name),
        [(0, sin_traces[0].op_name, 0)])
    # The last Sin op is consumed by the Unique op.
    self.assertEqual(
        graph.get_op_consumers(sin_traces[0].op_name),
        [(0, unique_traces[0].op_name, 0)])
    # The Unique op's 2nd output tensor is consumed by the Max op.
    self.assertEqual(
        graph.get_op_consumers(unique_traces[0].op_name),
        [(1, max_traces[0].op_name, 0)])
def testCapturingExecutedGraphIdsOfTwoCompilationsOfSameFunction(self):
"""Test correct executed IDs of two FuncGraphs from the same Py function."""
writer = dumping_callback.enable_dump_debug_info(