[tfdbg2] Various improvements to DebugDataReader for DebuggerV2
This is related to https://github.com/tensorflow/tensorboard/pull/3564

1. Add DebuggedGraph.get_op_creation_digest().
2. Remove DebuggedGraph.get_op_type(), which is superseded by
   DebuggedGraph.get_op_creation_digest() and is not used anywhere.
3. Add DebuggedGraph.add_op_consumer() and DebuggedGraph.get_op_consumers()
   to enable efficient tracking of the downstream consuming ops of a graph op.
4. Add host_name and stack_frame_ids to the data class GraphOpCreationDigest.

PiperOrigin-RevId: 309455936
Change-Id: I104084c1ef8b887f69733702a2f4c3190fa5402f
parent 6232075ca8
commit 958fbebe70
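At a glance, the new surface area reads as follows. This is a minimal usage sketch, not code from this commit; the dump path, graph ID, and op name are hypothetical placeholders:

    from tensorflow.python.debug.lib import debug_events_reader

    # "/tmp/tfdbg2_dump" and "some_graph_id" are hypothetical.
    with debug_events_reader.DebugDataReader("/tmp/tfdbg2_dump") as reader:
      reader.update()  # Incrementally load the debug-event files.
      graph = reader.graph_by_id("some_graph_id")
      # get_op_creation_digest() supersedes the removed get_op_type().
      digest = graph.get_op_creation_digest("my_op")  # hypothetical op name
      print(digest.op_type, digest.host_name, digest.stack_frame_ids)
      # Each consumer record is a (src_slot, dst_op_name, dst_slot) tuple.
      for src_slot, dst_op_name, dst_slot in graph.get_op_consumers("my_op"):
        print("output slot %d feeds %s at input slot %d"
              % (src_slot, dst_op_name, dst_slot))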
@@ -410,6 +410,8 @@ class DebuggedGraph(object):
     self._inner_graph_ids = []
     # A dictionary from op name to GraphOpCreationDigest.
     self._op_by_name = dict()
+    # A dictionary mapping op to immediate downstream consumers.
+    self._op_consumers = collections.defaultdict(list)
 
   def add_inner_graph_id(self, inner_graph_id):
     """Add the debugger-generated ID of a graph nested within this graph.
@@ -434,6 +436,18 @@ class DebuggedGraph(object):
     self._op_by_name[
         graph_op_creation_digest.op_name] = graph_op_creation_digest
 
+  def add_op_consumer(self, src_op_name, src_slot, dst_op_name, dst_slot):
+    """Add a consuming op for this op.
+
+    Args:
+      src_op_name: Name of the op of which the output tensor is being consumed.
+      src_slot: 0-based output slot of the op being consumed.
+      dst_op_name: Name of the consuming op (e.g., "Conv2D_3/BiasAdd")
+      dst_slot: 0-based input slot of the consuming op that receives the tensor
+        from this op.
+    """
+    self._op_consumers[src_op_name].append((src_slot, dst_op_name, dst_slot))
+
   @property
   def name(self):
     return self._name
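The consumer map recorded above is a collections.defaultdict(list) keyed by the producing op's name, so querying an op with no recorded consumers yields an empty list rather than a KeyError. A toy illustration with invented op names:

    import collections

    op_consumers = collections.defaultdict(list)
    # Output slot 0 of "AddV2_1" feeds input slot 0 of "Log_1".
    op_consumers["AddV2_1"].append((0, "Log_1", 0))
    print(op_consumers["AddV2_1"])   # [(0, 'Log_1', 0)]
    print(op_consumers["NoSuchOp"])  # [] -- no KeyError for unseen producers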
@@ -450,13 +464,33 @@ class DebuggedGraph(object):
   def inner_graph_ids(self):
     return self._inner_graph_ids
 
-  def get_op_type(self, op_name):
-    return self._op_by_name[op_name].op_type
-
   def get_tensor_id(self, op_name, output_slot):
     """Get the ID of a symbolic tensor in this graph."""
     return self._op_by_name[op_name].output_tensor_ids[output_slot]
 
+  def get_op_creation_digest(self, op_name):
+    """Get the GraphOpCreationDigest for an op in the graph."""
+    return self._op_by_name[op_name]
+
+  def get_op_consumers(self, src_op_name):
+    """Get all the downstream consumers of this op.
+
+    Only data (non-control) edges are tracked.
+
+    Args:
+      src_op_name: Name of the op providing the tensor being consumed.
+
+    Returns:
+      A list of (src_slot, dst_op_name, dst_slot) tuples. In each item of
+      the list:
+        src_slot: 0-based output slot of the op of which the output tensor
+          is being consumed.
+        dst_op_name: Name of the consuming op (e.g., "Conv2D_3/BiasAdd")
+        dst_slot: 0-based input slot of the consuming op that receives
+          the tensor from this op.
+    """
+    return self._op_consumers[src_op_name]
+
   # TODO(cais): Implement to_json().
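get_op_creation_digest() generalizes the removed get_op_type(): callers that only need the op type read it off the returned digest, which now also carries host_name and stack_frame_ids. Continuing the usage sketch from the top of this page, where `graph` is a DebuggedGraph obtained from reader.graph_by_id() and "my_op" is a hypothetical op name:

    # Before this change:  op_type = graph.get_op_type("my_op")
    # After this change:
    digest = graph.get_op_creation_digest("my_op")
    op_type = digest.op_type
    # New in this change: returns [] if nothing consumes the op's outputs.
    consumers = graph.get_op_consumers("my_op")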
@@ -500,6 +534,9 @@ class GraphOpCreationDigest(BaseDigest):
     output_tensor_ids: Debugger-generated IDs for the output(s) of the op.
     input_names: Names of the input tensors to the op.
     device_name: The name of the device that the op is placed on (if available).
+    host_name: Name of the host on which the op is created.
+    stack_frame_ids: IDs of the frames of the stack trace at which the op
+      is created.
   """
 
   def __init__(self,
@@ -509,6 +546,8 @@ class GraphOpCreationDigest(BaseDigest):
                op_type,
                op_name,
                output_tensor_ids,
+               host_name,
+               stack_frame_ids,
                input_names=None,
                device_name=None):
     super(GraphOpCreationDigest, self).__init__(wall_time, offset)
@@ -516,6 +555,8 @@ class GraphOpCreationDigest(BaseDigest):
     self._op_type = op_type
     self._op_name = op_name
     self._output_tensor_ids = _tuple_or_none(output_tensor_ids)
+    self._host_name = host_name
+    self._stack_frame_ids = stack_frame_ids
     self._input_names = _tuple_or_none(input_names)
     self._device_name = device_name
 
@@ -547,6 +588,14 @@ class GraphOpCreationDigest(BaseDigest):
   def device_name(self):
     return self._device_name
 
+  @property
+  def host_name(self):
+    return self._host_name
+
+  @property
+  def stack_frame_ids(self):
+    return self._stack_frame_ids
+
   def to_json(self):
     output = super(GraphOpCreationDigest, self).to_json()
     output.update({
@@ -554,6 +603,8 @@ class GraphOpCreationDigest(BaseDigest):
         "op_type": self.op_type,
         "op_name": self.op_name,
         "output_tensor_ids": self.output_tensor_ids,
+        "host_name": self.host_name,
+        "stack_frame_ids": self.stack_frame_ids,
         "input_names": self.input_names,
         "device_name": self.device_name,
     })
@@ -849,9 +900,17 @@ class DebugDataReader(object):
             op_creation_proto.op_type,
             op_creation_proto.op_name,
             tuple(op_creation_proto.output_tensor_ids),
+            op_creation_proto.code_location.host_name,
+            tuple(op_creation_proto.code_location.stack_frame_ids),
             input_names=tuple(op_creation_proto.input_names))
         self._graph_op_digests.append(op_digest)
-        self._graph_by_id[op_creation_proto.graph_id].add_op(op_digest)
+        debugged_graph = self._graph_by_id[op_creation_proto.graph_id]
+        debugged_graph.add_op(op_digest)
+        for dst_slot, input_name in enumerate(op_creation_proto.input_names):
+          src_op_name, src_slot = input_name.split(":")
+          debugged_graph.add_op_consumer(src_op_name, int(src_slot),
+                                         op_creation_proto.op_name, dst_slot)
       elif debug_event.debugged_graph.ByteSize():
         graph_proto = debug_event.debugged_graph
         graph = DebuggedGraph(
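The new loop above leans on TensorFlow's convention that each entry of input_names has the form "<op_name>:<output_slot>", so a single split(":") recovers the producer op and its output slot, while enumerate() supplies the consumer's input slot. A standalone sketch with invented tensor names:

    # Each input name follows the "<op_name>:<output_slot>" convention.
    input_names = ["Model_1/Add:0", "Model_1/Unique:1"]
    for dst_slot, input_name in enumerate(input_names):
      src_op_name, src_slot = input_name.split(":")
      # e.g., output slot 0 of "Model_1/Add" feeds this op's input slot 0.
      print((src_op_name, int(src_slot)), "-> input slot", dst_slot)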
@@ -936,7 +995,7 @@ class DebugDataReader(object):
     Returns:
       Op type as a str.
     """
-    return self._graph_by_id[graph_id].get_op_type(op_name)
+    return self._graph_by_id[graph_id].get_op_creation_digest(op_name).op_type
 
   def _load_execution(self):
     """Incrementally read the .execution file."""
@@ -1136,13 +1195,10 @@ class DebugDataReader(object):
       1. The host name.
       2. The stack trace, as a list of (file_path, lineno, func) tuples.
     """
-    debug_event = self._reader.read_graphs_event(
-        graph_op_creation_digest.offset)
-    graph_op_creation = debug_event.graph_op_creation
-    host_name = graph_op_creation.code_location.host_name
-    return host_name, [
+    return graph_op_creation_digest.host_name, [
         self._stack_frame_by_id[frame_id][1:]
-        for frame_id in graph_op_creation.code_location.stack_frame_ids]
+        for frame_id in graph_op_creation_digest.stack_frame_ids
+    ]
 
   # TODO(cais): Add graph_execution_digests() with an ExecutionDigest
   # as a kwarg, to establish the association between top-level and intra-graph
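Because the digest now carries host_name and stack_frame_ids (populated in the hunk at -849 above), resolving a stack trace no longer re-reads the .graphs file; it becomes a pure in-memory lookup. A hypothetical standalone equivalent, assuming (as the [1:] slice suggests) that _stack_frame_by_id maps a frame ID to a (host_name, file_path, lineno, func) tuple; the frame IDs and locations below are invented:

    import collections

    Digest = collections.namedtuple("Digest", ["host_name", "stack_frame_ids"])

    stack_frame_by_id = {
        "f1": ("worker0", "model.py", 42, "build"),
        "f2": ("worker0", "train.py", 7, "main"),
    }

    def read_stack_trace(digest):
      # Drop the leading host_name via [1:], keeping (file_path, lineno, func).
      return digest.host_name, [
          stack_frame_by_id[frame_id][1:] for frame_id in digest.stack_frame_ids
      ]

    print(read_stack_trace(Digest("worker0", ["f1", "f2"])))
    # ('worker0', [('model.py', 42, 'build'), ('train.py', 7, 'main')])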
@@ -662,8 +662,14 @@ class DataObjectsTest(test_util.TensorFlowTestCase):
 
   def testGraphOpCreationDigestNoInputNoDeviceNameToJson(self):
     op_creation_digest = debug_events_reader.GraphOpCreationDigest(
-        1234, 5678, "deadbeef", "FooOp", "Model_1/Foo_2",
-        [135], input_names=None, device_name=None)
+        1234,
+        5678,
+        "deadbeef",
+        "FooOp",
+        "Model_1/Foo_2", [135],
+        "machine.cluster", ("a1", "a2"),
+        input_names=None,
+        device_name=None)
     json = op_creation_digest.to_json()
     self.jsonRoundTripCheck(json)
     self.assertEqual(json["wall_time"], 1234)
@@ -671,13 +677,21 @@ class DataObjectsTest(test_util.TensorFlowTestCase):
     self.assertEqual(json["op_type"], "FooOp")
     self.assertEqual(json["op_name"], "Model_1/Foo_2")
     self.assertEqual(json["output_tensor_ids"], (135,))
+    self.assertEqual(json["host_name"], "machine.cluster")
+    self.assertEqual(json["stack_frame_ids"], ("a1", "a2"))
     self.assertIsNone(json["input_names"])
     self.assertIsNone(json["device_name"])
 
   def testGraphOpCreationDigestWithInputsAndDeviceNameToJson(self):
     op_creation_digest = debug_events_reader.GraphOpCreationDigest(
-        1234, 5678, "deadbeef", "FooOp", "Model_1/Foo_2",
-        [135], input_names=["Bar_1", "Qux_2"], device_name="/device:GPU:0")
+        1234,
+        5678,
+        "deadbeef",
+        "FooOp",
+        "Model_1/Foo_2", [135],
+        "machine.cluster", ("a1", "a2"),
+        input_names=["Bar_1", "Qux_2"],
+        device_name="/device:GPU:0")
     json = op_creation_digest.to_json()
     self.jsonRoundTripCheck(json)
     self.assertEqual(json["wall_time"], 1234)
@@ -685,6 +699,8 @@ class DataObjectsTest(test_util.TensorFlowTestCase):
     self.assertEqual(json["op_type"], "FooOp")
     self.assertEqual(json["op_name"], "Model_1/Foo_2")
     self.assertEqual(json["output_tensor_ids"], (135,))
+    self.assertEqual(json["host_name"], "machine.cluster")
+    self.assertEqual(json["stack_frame_ids"], ("a1", "a2"))
     self.assertEqual(json["input_names"], ("Bar_1", "Qux_2"))
     self.assertEqual(json["device_name"], "/device:GPU:0")
 
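The two tests above pin down the serialized shape of the enriched digest. Restricted to the keys they actually assert (BaseDigest may contribute further keys), the to_json() payload of the first test case is expected to contain:

    expected_subset = {
        "wall_time": 1234,
        "op_type": "FooOp",
        "op_name": "Model_1/Foo_2",
        "output_tensor_ids": (135,),
        "host_name": "machine.cluster",
        "stack_frame_ids": ("a1", "a2"),
        "input_names": None,
        "device_name": None,
    }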
@@ -756,6 +756,63 @@ class DumpingCallbackTest(
         non_placeholder_full_tensor_values[3],
         np.sin(np.log(5.0) + 1.0))  # Sin op.
 
+  @parameterized.named_parameters(
+      ("NoTensor", "NO_TENSOR"),
+      ("FullTensor", "FULL_TENSOR"),
+  )
+  @test_util.run_in_graph_and_eager_modes
+  def testGraphOpConsumingRelationIsCaptured(self, tensor_debug_mode):
+    writer = dumping_callback.enable_dump_debug_info(
+        self.dump_root, tensor_debug_mode=tensor_debug_mode)
+
+    @def_function.function
+    def log_sum(x, y):
+      return math_ops.log(x + y)
+
+    @def_function.function
+    def maxindex_sin1p_log_sum(x, y):
+      _, indices = array_ops.unique(math_ops.sin(1.0 + log_sum(x, y)))
+      return math_ops.reduce_max(indices)
+
+    x = constant_op.constant([2.0, 2.0])
+    y = constant_op.constant([3.0, 3.0])
+    maxindex = maxindex_sin1p_log_sum(x, y)
+    self.assertAllEqual(maxindex, 0)
+    writer.FlushNonExecutionFiles()
+    writer.FlushExecutionFiles()
+
+    with debug_events_reader.DebugDataReader(self.dump_root) as reader:
+      reader.update()
+      traces = reader.graph_execution_traces()
+      add_traces = [trace for trace in traces if trace.op_type == "AddV2"]
+      log_traces = [trace for trace in traces if trace.op_type == "Log"]
+      sin_traces = [trace for trace in traces if trace.op_type == "Sin"]
+      unique_traces = [trace for trace in traces if trace.op_type == "Unique"]
+      max_traces = [trace for trace in traces if trace.op_type == "Max"]
+      self.assertLen(add_traces, 2)
+      self.assertLen(log_traces, 1)
+      self.assertLen(sin_traces, 1)
+      self.assertLen(unique_traces, 2)  # The Unique op outputs two tensors.
+      self.assertLen(max_traces, 1)
+      graph = reader.graph_by_id(add_traces[0].graph_id)
+      # The first AddV2 op is consumed by the Log op.
+      self.assertEqual(
+          graph.get_op_consumers(add_traces[0].op_name),
+          [(0, log_traces[0].op_name, 0)])
+      graph = reader.graph_by_id(add_traces[1].graph_id)
+      # The second AddV2 op is consumed by the Sin op.
+      self.assertEqual(
+          graph.get_op_consumers(add_traces[1].op_name),
+          [(0, sin_traces[0].op_name, 0)])
+      # The last Sin op is consumed by the Unique op.
+      self.assertEqual(
+          graph.get_op_consumers(sin_traces[0].op_name),
+          [(0, unique_traces[0].op_name, 0)])
+      # The Unique op's 2nd output tensor is consumed by the Max op.
+      self.assertEqual(
+          graph.get_op_consumers(unique_traces[0].op_name),
+          [(1, max_traces[0].op_name, 0)])
+
   def testCapturingExecutedGraphIdsOfTwoCompilationsOfSameFunction(self):
     """Test correct executed IDs of two FuncGraphs from the same Py function."""
     writer = dumping_callback.enable_dump_debug_info(
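For readers tracing the assertions in testGraphOpConsumingRelationIsCaptured, the dataflow the test builds and then checks is, schematically (the first AddV2 lives in log_sum's inner FuncGraph, the second in the outer one, which is why the test fetches two graphs):

    # x + y            -> AddV2  (slot 0 feeds Log at input slot 0)
    # log(x + y)       -> Log
    # 1.0 + log_sum(.) -> AddV2  (slot 0 feeds Sin at input slot 0)
    # sin(...)         -> Sin    (slot 0 feeds Unique at input slot 0)
    # unique(...)      -> Unique (two outputs; output slot 1 feeds Max)
    # reduce_max(...)  -> Max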