[tfdbg2] Various improvements to DebugDataReader for DebuggerV2
This is related to https://github.com/tensorflow/tensorboard/pull/3564.

1. Add DebuggedGraph.get_op_creation_digest().
2. Remove DebuggedGraph.get_op_type(), which is superseded by
   DebuggedGraph.get_op_creation_digest() and is not used anywhere.
3. Add DebuggedGraph.add_op_consumer() and DebuggedGraph.get_op_consumers()
   to enable efficient tracking of the downstream consuming ops of a graph op.
4. Add host_name and stack_frame_ids to the data class GraphOpCreationDigest.

PiperOrigin-RevId: 309455936
Change-Id: I104084c1ef8b887f69733702a2f4c3190fa5402f
Parent: 6232075ca8
Commit: 958fbebe70
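As a usage sketch of the reader-side API after this change, the two new DebuggedGraph accessors can be reached through DebugDataReader roughly as follows. The dump directory path and variable names are assumptions for illustration; the API calls themselves all appear elsewhere in this diff.

from tensorflow.python.debug.lib import debug_events_reader

# "/tmp/tfdbg2_dump" is a hypothetical dump root previously written by
# dumping_callback.enable_dump_debug_info(); it is not part of this commit.
with debug_events_reader.DebugDataReader("/tmp/tfdbg2_dump") as reader:
  reader.update()
  for trace in reader.graph_execution_traces():
    graph = reader.graph_by_id(trace.graph_id)
    digest = graph.get_op_creation_digest(trace.op_name)
    # (src_slot, dst_op_name, dst_slot) tuples for the op's data-edge consumers.
    consumers = graph.get_op_consumers(trace.op_name)
    print(digest.op_type, digest.host_name, digest.stack_frame_ids, consumers)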
@@ -410,6 +410,8 @@ class DebuggedGraph(object):
     self._inner_graph_ids = []
     # A dictionary from op name to GraphOpCreationDigest.
     self._op_by_name = dict()
+    # A dictionary mapping op to immediate downstream consumers.
+    self._op_consumers = collections.defaultdict(list)
 
   def add_inner_graph_id(self, inner_graph_id):
     """Add the debugger-generated ID of a graph nested within this graph.
@@ -434,6 +436,18 @@ class DebuggedGraph(object):
     self._op_by_name[
         graph_op_creation_digest.op_name] = graph_op_creation_digest
 
+  def add_op_consumer(self, src_op_name, src_slot, dst_op_name, dst_slot):
+    """Add a consuming op for this op.
+
+    Args:
+      src_op_name: Name of the op of which the output tensor is being consumed.
+      src_slot: 0-based output slot of the op being consumed.
+      dst_op_name: Name of the consuming op (e.g., "Conv2D_3/BiasAdd")
+      dst_slot: 0-based input slot of the consuming op that receives the tensor
+        from this op.
+    """
+    self._op_consumers[src_op_name].append((src_slot, dst_op_name, dst_slot))
+
   @property
   def name(self):
     return self._name
@@ -450,13 +464,33 @@ class DebuggedGraph(object):
   def inner_graph_ids(self):
     return self._inner_graph_ids
 
-  def get_op_type(self, op_name):
-    return self._op_by_name[op_name].op_type
-
   def get_tensor_id(self, op_name, output_slot):
     """Get the ID of a symbolic tensor in this graph."""
     return self._op_by_name[op_name].output_tensor_ids[output_slot]
 
+  def get_op_creation_digest(self, op_name):
+    """Get the GraphOpCreationDigest for an op in the graph."""
+    return self._op_by_name[op_name]
+
+  def get_op_consumers(self, src_op_name):
+    """Get all the downstream consumers of this op.
+
+    Only data (non-control) edges are tracked.
+
+    Args:
+      src_op_name: Name of the op providing the tensor being consumed.
+
+    Returns:
+      A list of (src_slot, dst_op_name, dst_slot) tuples. In each item of
+      the list:
+        src_slot: 0-based output slot of the op of which the output tensor
+          is being consumed.
+        dst_op_name: Name of the consuming op (e.g., "Conv2D_3/BiasAdd")
+        dst_slot: 0-based input slot of the consuming op that receives
+          the tensor from this op.
+    """
+    return self._op_consumers[src_op_name]
+
   # TODO(cais): Implement to_json().
 
 
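To make the consumer records concrete, below is a minimal, self-contained sketch that mirrors the bookkeeping added above. _ConsumerMapSketch is a simplified stand-in written for illustration, not the TensorFlow class itself.

import collections


class _ConsumerMapSketch(object):
  """Simplified mirror of DebuggedGraph's new consumer tracking."""

  def __init__(self):
    # Maps src_op_name -> list of (src_slot, dst_op_name, dst_slot).
    self._op_consumers = collections.defaultdict(list)

  def add_op_consumer(self, src_op_name, src_slot, dst_op_name, dst_slot):
    self._op_consumers[src_op_name].append((src_slot, dst_op_name, dst_slot))

  def get_op_consumers(self, src_op_name):
    return self._op_consumers[src_op_name]


graph = _ConsumerMapSketch()
# Output slot 0 of "AddV2_1" feeds input slot 0 of "Log_1".
graph.add_op_consumer("AddV2_1", 0, "Log_1", 0)
assert graph.get_op_consumers("AddV2_1") == [(0, "Log_1", 0)]
# An op that nothing consumes yields an empty list, because the backing
# store is a defaultdict(list).
assert graph.get_op_consumers("Log_1") == []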
@@ -500,6 +534,9 @@ class GraphOpCreationDigest(BaseDigest):
     output_tensor_ids: Debugger-generated IDs for the output(s) of the op.
     input_names: Names of the input tensors to the op.
     device_name: The name of the device that the op is placed on (if available).
+    host_name: Name of the host on which the op is created.
+    stack_frame_ids: IDs of the frames of the stack trace at which the op
+      is created.
   """
 
   def __init__(self,
@@ -509,6 +546,8 @@ class GraphOpCreationDigest(BaseDigest):
                op_type,
                op_name,
                output_tensor_ids,
+               host_name,
+               stack_frame_ids,
                input_names=None,
                device_name=None):
     super(GraphOpCreationDigest, self).__init__(wall_time, offset)
@@ -516,6 +555,8 @@ class GraphOpCreationDigest(BaseDigest):
     self._op_type = op_type
     self._op_name = op_name
     self._output_tensor_ids = _tuple_or_none(output_tensor_ids)
+    self._host_name = host_name
+    self._stack_frame_ids = stack_frame_ids
     self._input_names = _tuple_or_none(input_names)
     self._device_name = device_name
 
@@ -547,6 +588,14 @@ class GraphOpCreationDigest(BaseDigest):
   def device_name(self):
     return self._device_name
 
+  @property
+  def host_name(self):
+    return self._host_name
+
+  @property
+  def stack_frame_ids(self):
+    return self._stack_frame_ids
+
   def to_json(self):
     output = super(GraphOpCreationDigest, self).to_json()
     output.update({
@@ -554,6 +603,8 @@ class GraphOpCreationDigest(BaseDigest):
         "op_type": self.op_type,
         "op_name": self.op_name,
         "output_tensor_ids": self.output_tensor_ids,
+        "host_name": self.host_name,
+        "stack_frame_ids": self.stack_frame_ids,
         "input_names": self.input_names,
         "device_name": self.device_name,
     })
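Combined with the test expectations further down, to_json() on a digest built from the "no inputs, no device" fixture now returns a dictionary shaped roughly like the literal below. Keys contributed by BaseDigest other than "wall_time" are omitted here, so this is an approximation rather than the exhaustive output.

{
    "wall_time": 1234,
    "op_type": "FooOp",
    "op_name": "Model_1/Foo_2",
    "output_tensor_ids": (135,),
    "host_name": "machine.cluster",
    "stack_frame_ids": ("a1", "a2"),
    "input_names": None,
    "device_name": None,
}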
@@ -849,9 +900,17 @@ class DebugDataReader(object):
           op_creation_proto.op_type,
           op_creation_proto.op_name,
           tuple(op_creation_proto.output_tensor_ids),
+          op_creation_proto.code_location.host_name,
+          tuple(op_creation_proto.code_location.stack_frame_ids),
           input_names=tuple(op_creation_proto.input_names))
       self._graph_op_digests.append(op_digest)
-      self._graph_by_id[op_creation_proto.graph_id].add_op(op_digest)
+      debugged_graph = self._graph_by_id[op_creation_proto.graph_id]
+      debugged_graph.add_op(op_digest)
+      for dst_slot, input_name in enumerate(op_creation_proto.input_names):
+        src_op_name, src_slot = input_name.split(":")
+        debugged_graph.add_op_consumer(src_op_name, int(src_slot),
+                                       op_creation_proto.op_name, dst_slot)
+
     elif debug_event.debugged_graph.ByteSize():
       graph_proto = debug_event.debugged_graph
       graph = DebuggedGraph(
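The consumer edges above are derived from the proto's input_names entries, each of the form "<src_op_name>:<output_slot>". A standalone sketch of that parsing step, with made-up op names:

# Made-up example values; the real ones come from op_creation_proto.
input_names = ["Model_1/AddV2:0", "Model_1/Unique:1"]
consuming_op_name = "Model_1/Log"

for dst_slot, input_name in enumerate(input_names):
  src_op_name, src_slot = input_name.split(":")
  # Each input becomes one (src_slot, dst_op_name, dst_slot) record on the
  # graph via DebuggedGraph.add_op_consumer().
  print(src_op_name, "->", (int(src_slot), consuming_op_name, dst_slot))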
@@ -936,7 +995,7 @@ class DebugDataReader(object):
     Returns:
       Op type as a str.
     """
-    return self._graph_by_id[graph_id].get_op_type(op_name)
+    return self._graph_by_id[graph_id].get_op_creation_digest(op_name).op_type
 
   def _load_execution(self):
     """Incrementally read the .execution file."""
@@ -1136,13 +1195,10 @@ class DebugDataReader(object):
       1. The host name.
       2. The stack trace, as a list of (file_path, lineno, func) tuples.
     """
-    debug_event = self._reader.read_graphs_event(
-        graph_op_creation_digest.offset)
-    graph_op_creation = debug_event.graph_op_creation
-    host_name = graph_op_creation.code_location.host_name
-    return host_name, [
+    return graph_op_creation_digest.host_name, [
         self._stack_frame_by_id[frame_id][1:]
-        for frame_id in graph_op_creation.code_location.stack_frame_ids]
+        for frame_id in graph_op_creation_digest.stack_frame_ids
+    ]
 
   # TODO(cais): Add graph_execution_digests() with an ExecutionDigest
   # as a kwarg, to establish the association between top-level and intra-graph
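With host_name and stack_frame_ids now stored on the digest, the lookup above no longer has to re-read the .graphs file through self._reader. Below is a self-contained sketch of the same resolution logic; FakeDigest and the frame-table contents are assumptions made up for illustration.

import collections

# Hypothetical stand-in carrying only the two new digest fields.
FakeDigest = collections.namedtuple("FakeDigest", ["host_name", "stack_frame_ids"])

# Assumed shape of the reader's internal frame table: the first tuple element
# is dropped by the [1:] slice, leaving (file_path, lineno, func).
stack_frame_by_id = {
    "a1": ("machine.cluster", "model.py", 42, "build_model"),
    "a2": ("machine.cluster", "train.py", 7, "main"),
}


def resolve_stack_trace(digest):
  # Mirrors the rewritten return statement: the host name plus
  # (file_path, lineno, func) tuples, taken straight from the digest.
  return digest.host_name, [
      stack_frame_by_id[frame_id][1:] for frame_id in digest.stack_frame_ids
  ]


digest = FakeDigest(host_name="machine.cluster", stack_frame_ids=("a1", "a2"))
print(resolve_stack_trace(digest))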
@@ -662,8 +662,14 @@ class DataObjectsTest(test_util.TensorFlowTestCase):
 
   def testGraphOpCreationDigestNoInputNoDeviceNameToJson(self):
     op_creation_digest = debug_events_reader.GraphOpCreationDigest(
-        1234, 5678, "deadbeef", "FooOp", "Model_1/Foo_2",
-        [135], input_names=None, device_name=None)
+        1234,
+        5678,
+        "deadbeef",
+        "FooOp",
+        "Model_1/Foo_2", [135],
+        "machine.cluster", ("a1", "a2"),
+        input_names=None,
+        device_name=None)
     json = op_creation_digest.to_json()
     self.jsonRoundTripCheck(json)
     self.assertEqual(json["wall_time"], 1234)
@@ -671,13 +677,21 @@ class DataObjectsTest(test_util.TensorFlowTestCase):
     self.assertEqual(json["op_type"], "FooOp")
     self.assertEqual(json["op_name"], "Model_1/Foo_2")
     self.assertEqual(json["output_tensor_ids"], (135,))
+    self.assertEqual(json["host_name"], "machine.cluster")
+    self.assertEqual(json["stack_frame_ids"], ("a1", "a2"))
     self.assertIsNone(json["input_names"])
     self.assertIsNone(json["device_name"])
 
   def testGraphOpCreationDigestWithInputsAndDeviceNameToJson(self):
     op_creation_digest = debug_events_reader.GraphOpCreationDigest(
-        1234, 5678, "deadbeef", "FooOp", "Model_1/Foo_2",
-        [135], input_names=["Bar_1", "Qux_2"], device_name="/device:GPU:0")
+        1234,
+        5678,
+        "deadbeef",
+        "FooOp",
+        "Model_1/Foo_2", [135],
+        "machine.cluster", ("a1", "a2"),
+        input_names=["Bar_1", "Qux_2"],
+        device_name="/device:GPU:0")
     json = op_creation_digest.to_json()
     self.jsonRoundTripCheck(json)
     self.assertEqual(json["wall_time"], 1234)
@@ -685,6 +699,8 @@ class DataObjectsTest(test_util.TensorFlowTestCase):
     self.assertEqual(json["op_type"], "FooOp")
     self.assertEqual(json["op_name"], "Model_1/Foo_2")
     self.assertEqual(json["output_tensor_ids"], (135,))
+    self.assertEqual(json["host_name"], "machine.cluster")
+    self.assertEqual(json["stack_frame_ids"], ("a1", "a2"))
     self.assertEqual(json["input_names"], ("Bar_1", "Qux_2"))
     self.assertEqual(json["device_name"], "/device:GPU:0")
 
@@ -756,6 +756,63 @@ class DumpingCallbackTest(
         non_placeholder_full_tensor_values[3],
         np.sin(np.log(5.0) + 1.0))  # Sin op.
 
+  @parameterized.named_parameters(
+      ("NoTensor", "NO_TENSOR"),
+      ("FullTensor", "FULL_TENSOR"),
+  )
+  @test_util.run_in_graph_and_eager_modes
+  def testGraphOpConsumingRelationIsCaptured(self, tensor_debug_mode):
+    writer = dumping_callback.enable_dump_debug_info(
+        self.dump_root, tensor_debug_mode=tensor_debug_mode)
+
+    @def_function.function
+    def log_sum(x, y):
+      return math_ops.log(x + y)
+
+    @def_function.function
+    def maxindex_sin1p_log_sum(x, y):
+      _, indices = array_ops.unique(math_ops.sin(1.0 + log_sum(x, y)))
+      return math_ops.reduce_max(indices)
+
+    x = constant_op.constant([2.0, 2.0])
+    y = constant_op.constant([3.0, 3.0])
+    maxindex = maxindex_sin1p_log_sum(x, y)
+    self.assertAllEqual(maxindex, 0)
+    writer.FlushNonExecutionFiles()
+    writer.FlushExecutionFiles()
+
+    with debug_events_reader.DebugDataReader(self.dump_root) as reader:
+      reader.update()
+      traces = reader.graph_execution_traces()
+      add_traces = [trace for trace in traces if trace.op_type == "AddV2"]
+      log_traces = [trace for trace in traces if trace.op_type == "Log"]
+      sin_traces = [trace for trace in traces if trace.op_type == "Sin"]
+      unique_traces = [trace for trace in traces if trace.op_type == "Unique"]
+      max_traces = [trace for trace in traces if trace.op_type == "Max"]
+      self.assertLen(add_traces, 2)
+      self.assertLen(log_traces, 1)
+      self.assertLen(sin_traces, 1)
+      self.assertLen(unique_traces, 2)  # The Unique op outputs two tensors.
+      self.assertLen(max_traces, 1)
+      graph = reader.graph_by_id(add_traces[0].graph_id)
+      # The first AddV2 op is consumed by the Log op.
+      self.assertEqual(
+          graph.get_op_consumers(add_traces[0].op_name),
+          [(0, log_traces[0].op_name, 0)])
+      graph = reader.graph_by_id(add_traces[1].graph_id)
+      # The second AddV2 op is consumed by the Sin op.
+      self.assertEqual(
+          graph.get_op_consumers(add_traces[1].op_name),
+          [(0, sin_traces[0].op_name, 0)])
+      # The last Sin op is consumed by the Unique op.
+      self.assertEqual(
+          graph.get_op_consumers(sin_traces[0].op_name),
+          [(0, unique_traces[0].op_name, 0)])
+      # The Unique op's 2nd output tensor is consumed by the Max op.
+      self.assertEqual(
+          graph.get_op_consumers(unique_traces[0].op_name),
+          [(1, max_traces[0].op_name, 0)])
+
   def testCapturingExecutedGraphIdsOfTwoCompilationsOfSameFunction(self):
     """Test correct executed IDs of two FuncGraphs from the same Py function."""
     writer = dumping_callback.enable_dump_debug_info(
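The per-op consumer lists also make transitive downstream traversal cheap. As a hedged sketch, walk_downstream_ops below is a hypothetical helper, not part of the commit; it relies only on DebuggedGraph.get_op_consumers() as added above.

import collections


def walk_downstream_ops(graph, start_op_name):
  # Yields (src_op_name, src_slot, dst_op_name, dst_slot) for every data edge
  # reachable downstream of start_op_name, breadth-first, visiting each op once.
  visited = {start_op_name}
  queue = collections.deque([start_op_name])
  while queue:
    op_name = queue.popleft()
    for src_slot, dst_op_name, dst_slot in graph.get_op_consumers(op_name):
      yield op_name, src_slot, dst_op_name, dst_slot
      if dst_op_name not in visited:
        visited.add(dst_op_name)
        queue.append(dst_op_name)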