[tfdbg] Implement some to_json() methods & miscellaneous changes

Background:
- Most of these additions are missing pieces of the `DebugDataReader`
  class that were discovered during the development of the tfdbg2 UIs
  (CLI and web GUI).

- This CL implements the to_json() method of the following data classes,
  thereby resolving the related TODO items (a brief usage sketch follows
  this list).
  - BaseDigest
  - ExecutionDigest
  - Execution
  - GraphOpCreationDigest
  - GraphExecutionTraceDigest
  - GraphExecutionTrace
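
  A minimal sketch of the round trip, assuming a TensorFlow build that
  includes this change (the constructor arguments mirror the new unit tests
  below; the concrete values are placeholders):

    import json

    from tensorflow.python.debug.lib import debug_events_reader

    # to_json() returns a plain dict of JSON-friendly values; the tuples it
    # contains serialize as JSON lists via the standard json module.
    digest = debug_events_reader.ExecutionDigest(
        1234, 5678, "FooOp", output_tensor_device_ids=[1357, 2468])
    print(json.dumps(digest.to_json(), sort_keys=True))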

Other changes:
- Add the `host_name` field to `Execution`.
- Add a `source_file_list()` method to `DebugDataReader`, which returns a
  tuple of `(host_name, file_path)` tuples for all source files involved in
  the execution of the program.
- In the `debug_tensor_value` property of `GraphExecutionTrace`, return
  `None` instead of an empty list when no data is available.
- Add a `graph_id` field to `GraphExecutionTraceDigest`.
- Change the `device_names()` method of `DebugDataReader` to
  `device_name_map()`, which returns a dict mapping device IDs to device
  names. This enables mapping `output_tensor_device_ids` in
  `ExecutionDigest` objects to actual device names (a usage sketch follows
  this list).
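
  A hedged sketch of the reader-level additions (the dump-root path is a
  placeholder; the calls mirror the new unit tests):

    from tensorflow.python.debug.lib import debug_events_reader

    dump_root = "/tmp/tfdbg2_dump_dir"  # hypothetical dump directory

    with debug_events_reader.DebugDataReader(dump_root) as reader:
      reader.update()
      # Tuple of (host_name, file_path) tuples for all known source files.
      source_files = reader.source_file_list()
      # Dict mapping debugger-generated device IDs to device names; useful
      # for resolving ExecutionDigest.output_tensor_device_ids.
      device_map = reader.device_name_map()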

PiperOrigin-RevId: 289167538
Change-Id: Ie79c736e068974649281e3d1756aacabd0ce6345
Shanqing Cai 2020-01-10 14:36:56 -08:00 committed by TensorFlower Gardener
parent 459c5cb980
commit 8fce32ec67
4 changed files with 277 additions and 30 deletions

View File

@ -197,7 +197,7 @@ class BaseDigest(object):
"""Base class for digest.
Properties:
wall_time: A timestamp for the digest (unit: s).
wall_time: A timestamp for the digest as a `float` (unit: s).
offset: An offset number in the corresponding file that can be used for
fast random read access.
"""
@ -214,6 +214,9 @@ class BaseDigest(object):
def offset(self):
return self._offset
def to_json(self):
return {"wall_time": self.wall_time}
class ExecutionDigest(BaseDigest):
"""Light-weight digest summarizing top-level execution event.
@ -238,7 +241,7 @@ class ExecutionDigest(BaseDigest):
output_tensor_device_ids=None):
super(ExecutionDigest, self).__init__(wall_time, offset)
self._op_type = op_type
self._output_tensor_device_ids = output_tensor_device_ids
self._output_tensor_device_ids = _tuple_or_none(output_tensor_device_ids)
@property
def op_type(self):
@ -248,7 +251,17 @@ class ExecutionDigest(BaseDigest):
def output_tensor_device_ids(self):
return self._output_tensor_device_ids
# TODO(cais): Implement to_json().
def to_json(self):
output = super(ExecutionDigest, self).to_json()
output.update({
"op_type": self.op_type,
"output_tensor_device_ids": self.output_tensor_device_ids,
})
return output
def _tuple_or_none(data):
return tuple(data) if data else None
class Execution(ExecutionDigest):
@ -258,6 +271,7 @@ class Execution(ExecutionDigest):
number of output tensors.
Properties (beyond the base class `ExecutionDigest`):
host_name: Name of the host on which the execution happened.
stack_frame_ids: Reference IDs for stack frames, ordered from bottommost to
topmost. Use `DebugDataReader.read_execution_stack_trace()` to load the
detailed stack frames (filepath, lineno and function name).
@ -277,6 +291,7 @@ class Execution(ExecutionDigest):
def __init__(self,
execution_digest,
host_name,
stack_frame_ids,
tensor_debug_mode,
graph_id=None,
@ -288,12 +303,17 @@ class Execution(ExecutionDigest):
execution_digest.offset,
execution_digest.op_type,
output_tensor_device_ids=execution_digest.output_tensor_device_ids)
self._stack_frame_ids = stack_frame_ids
self._host_name = host_name
self._stack_frame_ids = tuple(stack_frame_ids)
self._tensor_debug_mode = tensor_debug_mode
self._graph_id = graph_id
self._input_tensor_ids = input_tensor_ids
self._output_tensor_ids = output_tensor_ids
self._debug_tensor_values = debug_tensor_values
self._input_tensor_ids = _tuple_or_none(input_tensor_ids)
self._output_tensor_ids = _tuple_or_none(output_tensor_ids)
self._debug_tensor_values = _tuple_or_none(debug_tensor_values)
@property
def host_name(self):
return self._host_name
@property
def stack_frame_ids(self):
@ -323,7 +343,18 @@ class Execution(ExecutionDigest):
def debug_tensor_values(self):
return self._debug_tensor_values
# TODO(cais): Implement to_json().
def to_json(self):
output = super(Execution, self).to_json()
output.update({
"host_name": self.host_name,
"stack_frame_ids": self.stack_frame_ids,
"tensor_debug_mode": self.tensor_debug_mode,
"graph_id": self.graph_id,
"input_tensor_ids": self.input_tensor_ids,
"output_tensor_ids": self.output_tensor_ids,
"debug_tensor_values": self.debug_tensor_values,
})
return output
class DebuggedGraph(object):
@ -452,8 +483,8 @@ class GraphOpCreationDigest(BaseDigest):
self._graph_id = graph_id
self._op_type = op_type
self._op_name = op_name
self._output_tensor_ids = output_tensor_ids
self._input_names = input_names
self._output_tensor_ids = _tuple_or_none(output_tensor_ids)
self._input_names = _tuple_or_none(input_names)
self._device_name = device_name
@property
@ -484,7 +515,17 @@ class GraphOpCreationDigest(BaseDigest):
def device_name(self):
return self._device_name
# TODO(cais): Implement to_json().
def to_json(self):
output = super(GraphOpCreationDigest, self).to_json()
output.update({
"graph_id": self.graph_id,
"op_type": self.op_type,
"op_name": self.op_name,
"output_tensor_ids": self.output_tensor_ids,
"input_names": self.input_names,
"device_name": self.device_name,
})
return output
class GraphExecutionTraceDigest(BaseDigest):
@ -497,6 +538,8 @@ class GraphExecutionTraceDigest(BaseDigest):
op_type: Type name of the executed op (e.g., "Conv2D").
op_name: Name of the op (e.g., "conv_2d_3/Conv2D").
output_slot: Output slot index of the tensor.
graph_id: The debugger-generated ID of the innermost (immediately-enclosing)
graph.
"""
def __init__(self,
@ -504,11 +547,13 @@ class GraphExecutionTraceDigest(BaseDigest):
offset,
op_type,
op_name,
output_slot):
output_slot,
graph_id):
super(GraphExecutionTraceDigest, self).__init__(wall_time, offset)
self._op_type = op_type
self._op_name = op_name
self._output_slot = output_slot
self._graph_id = graph_id
@property
def op_type(self):
@ -522,7 +567,19 @@ class GraphExecutionTraceDigest(BaseDigest):
def output_slot(self):
return self._output_slot
# TODO(cais): Implement to_json().
@property
def graph_id(self):
return self._graph_id
def to_json(self):
output = super(GraphExecutionTraceDigest, self).to_json()
output.update({
"op_type": self.op_type,
"op_name": self.op_name,
"output_slot": self.output_slot,
"graph_id": self.graph_id,
})
return output
class GraphExecutionTrace(GraphExecutionTraceDigest):
@ -551,8 +608,9 @@ class GraphExecutionTrace(GraphExecutionTraceDigest):
graph_execution_trace_digest.offset,
graph_execution_trace_digest.op_type,
graph_execution_trace_digest.op_name,
graph_execution_trace_digest.output_slot)
self._graph_ids = graph_ids
graph_execution_trace_digest.output_slot,
graph_execution_trace_digest.graph_id)
self._graph_ids = tuple(graph_ids)
self._tensor_debug_mode = tensor_debug_mode
self._debug_tensor_value = debug_tensor_value
self._device_name = device_name
@ -571,13 +629,21 @@ class GraphExecutionTrace(GraphExecutionTraceDigest):
@property
def debug_tensor_value(self):
return self._debug_tensor_value
return _tuple_or_none(self._debug_tensor_value)
@property
def device_name(self):
return self._device_name
# TODO(cais): Implement to_json().
def to_json(self):
output = super(GraphExecutionTrace, self).to_json()
output.update({
"graph_ids": self.graph_ids,
"tensor_debug_mode": self.tensor_debug_mode,
"debug_tensor_value": self.debug_tensor_value,
"device_name": self.device_name,
})
return output
def _parse_tensor_value(tensor_proto, return_list=False):
@ -740,7 +806,8 @@ class DebugDataReader(object):
offset,
op_type,
op_name,
trace_proto.output_slot)
trace_proto.output_slot,
debug_event.graph_execution_trace.tfdbg_context_id)
self._graph_execution_trace_digests.append(digest)
def _lookup_op_type(self, graph_id, op_name):
@ -774,6 +841,14 @@ class DebugDataReader(object):
self._load_graph_execution_traces()
self._load_execution()
def source_file_list(self):
"""Get a list of source files known to the debugger data reader.
Returns:
A tuple of `(host_name, file_path)` tuples.
"""
return tuple(self._host_name_file_path_to_offset.keys())
def source_lines(self, host_name, file_path):
"""Read the line-by-line content of a source file.
@ -819,9 +894,10 @@ class DebugDataReader(object):
"""Get the name of a device by the debugger-generated ID of the device."""
return self._device_by_id[device_id].device_name
def device_names(self):
"""Get a set of all device names known to the debugger."""
return set(device.device_name for device in self._device_by_id.values())
def device_name_map(self):
"""Get a map mapping device IDs to device names."""
return {device_id: self._device_by_id[device_id].device_name
for device_id in self._device_by_id}
def graph_op_digests(self, op_type=None):
"""Get the list of the digests for graph-op creation so far.
@ -904,13 +980,13 @@ class DebugDataReader(object):
_parse_tensor_value(tensor_proto, return_list=True))
return Execution(
execution_digest,
execution_proto.code_location.host_name,
tuple(execution_proto.code_location.stack_frame_ids),
execution_proto.tensor_debug_mode,
graph_id=execution_proto.graph_id,
input_tensor_ids=tuple(execution_proto.input_tensor_ids),
output_tensor_ids=tuple(execution_proto.output_tensor_ids),
debug_tensor_values=tuple(
debug_tensor_values) if debug_tensor_values else None)
debug_tensor_values=_tuple_or_none(debug_tensor_values))
def read_graph_execution_trace(self, graph_execution_trace_digest):
"""Read the detailed graph execution trace.
@ -955,9 +1031,8 @@ class DebugDataReader(object):
execution: The Execution object of interest.
Returns:
A tuple consisting of:
1. The host name.
2. The stack trace, as a list of (file_path, lineno, func) tuples.
1. The host name.
2. The stack trace, as a list of (file_path, lineno, func) tuples.
"""
host_name = self._stack_frame_by_id[execution.stack_frame_ids[0]][0]
return (host_name, [

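A minimal sketch of consuming `DebugDataReader.read_execution_stack_trace()`
as documented above, assuming `reader` is an opened and updated
`DebugDataReader` and `execution` is an `Execution` object previously
obtained from it:

    # Returns the host name plus the stack frames as
    # (file_path, lineno, func) tuples.
    host_name, stack_frames = reader.read_execution_stack_trace(execution)
    for file_path, lineno, func in stack_frames:
      print("%s:%d: %s" % (file_path, lineno, func))
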
View File

@ -19,6 +19,7 @@ from __future__ import division
from __future__ import print_function
import glob
import json as json_lib
import os
import threading
import time
@ -28,6 +29,7 @@ from tensorflow.python.debug.lib import debug_events_reader
from tensorflow.python.debug.lib import debug_events_writer
from tensorflow.python.debug.lib import dumping_callback_test_lib
from tensorflow.python.framework import ops
from tensorflow.python.framework import test_util
from tensorflow.python.framework import versions
from tensorflow.python.platform import googletest
@ -340,6 +342,160 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase):
self.assertLen(op_names, len(set(op_names)))
class DataObjectsTest(test_util.TensorFlowTestCase):
def jsonRoundTripCheck(self, obj):
self.assertEqual(
json_lib.dumps(json_lib.loads(json_lib.dumps(obj)), sort_keys=True),
json_lib.dumps(obj, sort_keys=True))
def testExecutionDigestWithNoOutputToJson(self):
execution_digest = debug_events_reader.ExecutionDigest(
1234, 5678, "FooOp", output_tensor_device_ids=None)
json = execution_digest.to_json()
self.jsonRoundTripCheck(json)
self.assertEqual(json["wall_time"], 1234)
self.assertEqual(json["op_type"], "FooOp")
self.assertEqual(json["output_tensor_device_ids"], None)
def testExecutionDigestWithTwoOutputsToJson(self):
execution_digest = debug_events_reader.ExecutionDigest(
1234, 5678, "FooOp", output_tensor_device_ids=[1357, 2468])
json = execution_digest.to_json()
self.jsonRoundTripCheck(json)
self.assertEqual(json["wall_time"], 1234)
self.assertEqual(json["op_type"], "FooOp")
self.assertEqual(json["output_tensor_device_ids"], (1357, 2468))
def testExecutionNoGraphNoInputToJson(self):
execution_digest = debug_events_reader.ExecutionDigest(
1234, 5678, "FooOp", output_tensor_device_ids=[1357])
execution = debug_events_reader.Execution(
execution_digest,
"localhost",
("a1", "b2"),
debug_event_pb2.TensorDebugMode.CURT_HEALTH,
graph_id=None,
input_tensor_ids=None,
output_tensor_ids=[2468],
debug_tensor_values=([1, 0],))
json = execution.to_json()
self.jsonRoundTripCheck(json)
self.assertEqual(json["wall_time"], 1234)
self.assertEqual(json["op_type"], "FooOp")
self.assertEqual(json["output_tensor_device_ids"], (1357,))
self.assertEqual(json["host_name"], "localhost")
self.assertEqual(json["stack_frame_ids"], ("a1", "b2"))
self.assertEqual(json["tensor_debug_mode"],
debug_event_pb2.TensorDebugMode.CURT_HEALTH)
self.assertIsNone(json["graph_id"])
self.assertIsNone(json["input_tensor_ids"])
self.assertEqual(json["output_tensor_ids"], (2468,))
self.assertEqual(json["debug_tensor_values"], ([1, 0],))
def testExecutionNoGraphNoInputButWithOutputToJson(self):
execution_digest = debug_events_reader.ExecutionDigest(
1234, 5678, "FooOp", output_tensor_device_ids=[1357])
execution = debug_events_reader.Execution(
execution_digest,
"localhost",
("a1", "b2"),
debug_event_pb2.TensorDebugMode.FULL_HEALTH,
graph_id="abcd",
input_tensor_ids=[13, 37],
output_tensor_ids=None,
debug_tensor_values=None)
json = execution.to_json()
self.jsonRoundTripCheck(json)
self.assertEqual(json["wall_time"], 1234)
self.assertEqual(json["op_type"], "FooOp")
self.assertEqual(json["output_tensor_device_ids"], (1357,))
self.assertEqual(json["host_name"], "localhost")
self.assertEqual(json["stack_frame_ids"], ("a1", "b2"))
self.assertEqual(json["tensor_debug_mode"],
debug_event_pb2.TensorDebugMode.FULL_HEALTH)
self.assertEqual(json["graph_id"], "abcd")
self.assertEqual(json["input_tensor_ids"], (13, 37))
self.assertIsNone(json["output_tensor_ids"])
self.assertIsNone(json["debug_tensor_values"])
def testGraphOpCreationDigestNoInputNoDeviceNameToJson(self):
op_creation_digest = debug_events_reader.GraphOpCreationDigest(
1234, 5678, "deadbeef", "FooOp", "Model_1/Foo_2",
[135], input_names=None, device_name=None)
json = op_creation_digest.to_json()
self.jsonRoundTripCheck(json)
self.assertEqual(json["wall_time"], 1234)
self.assertEqual(json["graph_id"], "deadbeef")
self.assertEqual(json["op_type"], "FooOp")
self.assertEqual(json["op_name"], "Model_1/Foo_2")
self.assertEqual(json["output_tensor_ids"], (135,))
self.assertIsNone(json["input_names"])
self.assertIsNone(json["device_name"])
def testGraphOpCreationDigestWithInputsAndDeviceNameToJson(self):
op_creation_digest = debug_events_reader.GraphOpCreationDigest(
1234, 5678, "deadbeef", "FooOp", "Model_1/Foo_2",
[135], input_names=["Bar_1", "Qux_2"], device_name="/device:GPU:0")
json = op_creation_digest.to_json()
self.jsonRoundTripCheck(json)
self.assertEqual(json["wall_time"], 1234)
self.assertEqual(json["graph_id"], "deadbeef")
self.assertEqual(json["op_type"], "FooOp")
self.assertEqual(json["op_name"], "Model_1/Foo_2")
self.assertEqual(json["output_tensor_ids"], (135,))
self.assertEqual(json["input_names"], ("Bar_1", "Qux_2"))
self.assertEqual(json["device_name"], "/device:GPU:0")
def testGraphExecutionTraceDigestToJson(self):
trace_digest = debug_events_reader.GraphExecutionTraceDigest(
1234, 5678, "FooOp", "Model_1/Foo_2", 1, "deadbeef")
json = trace_digest.to_json()
self.assertEqual(json["wall_time"], 1234)
self.assertEqual(json["op_type"], "FooOp")
self.assertEqual(json["op_name"], "Model_1/Foo_2")
self.assertEqual(json["output_slot"], 1)
self.assertEqual(json["graph_id"], "deadbeef")
def testGraphExecutionTraceWithTensorDebugValueAndDeviceNameToJson(self):
trace_digest = debug_events_reader.GraphExecutionTraceDigest(
1234, 5678, "FooOp", "Model_1/Foo_2", 1, "deadbeef")
trace = debug_events_reader.GraphExecutionTrace(
trace_digest, ["g1", "g2", "deadbeef"],
debug_event_pb2.TensorDebugMode.CURT_HEALTH,
debug_tensor_value=[3, 1], device_name="/device:GPU:0")
json = trace.to_json()
self.assertEqual(json["wall_time"], 1234)
self.assertEqual(json["op_type"], "FooOp")
self.assertEqual(json["op_name"], "Model_1/Foo_2")
self.assertEqual(json["output_slot"], 1)
self.assertEqual(json["graph_id"], "deadbeef")
self.assertEqual(json["graph_ids"], ("g1", "g2", "deadbeef"))
self.assertEqual(json["tensor_debug_mode"],
debug_event_pb2.TensorDebugMode.CURT_HEALTH)
self.assertEqual(json["debug_tensor_value"], (3, 1))
self.assertEqual(json["device_name"], "/device:GPU:0")
def testGraphExecutionTraceNoTensorDebugValueNoDeviceNameToJson(self):
trace_digest = debug_events_reader.GraphExecutionTraceDigest(
1234, 5678, "FooOp", "Model_1/Foo_2", 1, "deadbeef")
trace = debug_events_reader.GraphExecutionTrace(
trace_digest, ["g1", "g2", "deadbeef"],
debug_event_pb2.TensorDebugMode.NO_TENSOR,
debug_tensor_value=None, device_name=None)
json = trace.to_json()
self.assertEqual(json["wall_time"], 1234)
self.assertEqual(json["op_type"], "FooOp")
self.assertEqual(json["op_name"], "Model_1/Foo_2")
self.assertEqual(json["output_slot"], 1)
self.assertEqual(json["graph_id"], "deadbeef")
self.assertEqual(json["graph_ids"], ("g1", "g2", "deadbeef"))
self.assertEqual(json["tensor_debug_mode"],
debug_event_pb2.TensorDebugMode.NO_TENSOR)
self.assertIsNone(json["debug_tensor_value"])
self.assertIsNone(json["device_name"])
if __name__ == "__main__":
ops.enable_eager_execution()
googletest.main()

View File

@ -171,7 +171,7 @@ class DistributedDumpingCallbackTest(
if tensor_debug_mode == "NO_TENSOR":
for trace in traces:
self.assertEqual(trace.debug_tensor_value, [])
self.assertIsNone(trace.debug_tensor_value)
elif tensor_debug_mode == "FULL_TENSOR":
device_0_matmul_values = [
reader.graph_execution_trace_to_tensor_value(trace)
@ -273,7 +273,7 @@ class DistributedDumpingCallbackTest(
if tensor_debug_mode == "NO_TENSOR":
for trace in traces:
self.assertEqual(trace.debug_tensor_value, [])
self.assertIsNone(trace.debug_tensor_value)
elif tensor_debug_mode == "FULL_TENSOR":
gpu_0_relu_values = [
reader.graph_execution_trace_to_tensor_value(trace)

View File

@ -342,6 +342,21 @@ class TracingCallbackTest(
self.assertAllClose(
trace.debug_tensor_value, [tensor_id, 10, 2, 4, 2, 2, 0, 0, 0, 0])
def testListingSourceFiles(self):
writer = dumping_callback.enable_dump_debug_info(self.dump_root)
# Run a simple eager execution event, so that the source files are dumped.
self.assertAllClose(math_ops.truediv(7.0, 1.0 / 6.0), 42.0)
writer.FlushNonExecutionFiles()
writer.FlushExecutionFiles()
with debug_events_reader.DebugDataReader(self.dump_root) as reader:
reader.update()
source_file_list = reader.source_file_list()
self.assertIsInstance(source_file_list, tuple)
for item in source_file_list:
self.assertIsInstance(item, tuple)
self.assertLen(item, 2)
self.assertIn((_host_name, _current_file_full_path), source_file_list)
def testReadingSourceLines(self):
writer = dumping_callback.enable_dump_debug_info(self.dump_root)
# Run a simple eager execution event, so that the source-file contents are
@ -405,7 +420,8 @@ class TracingCallbackTest(
self.assertEqual(
reader.device_name_by_id(executions[0].output_tensor_device_ids[0]),
self._expectedDefaultDeviceName())
self.assertIn(self._expectedDefaultDeviceName(), reader.device_names())
self.assertIn(self._expectedDefaultDeviceName(),
set(reader.device_name_map().values()))
# Verify the recorded graph-building history.
add_op_digests = reader.graph_op_digests(op_type="AddV2")
@ -463,7 +479,7 @@ class TracingCallbackTest(
# Under the default NO_TENSOR tensor-debug mode, the tensor_proto ought
# to be an empty float32 tensor.
for trace in graph_exec_traces:
self.assertEqual(trace.debug_tensor_value, [])
self.assertIsNone(trace.debug_tensor_value)
elif tensor_debug_mode == "CURT_HEALTH":
# Test the association between graph exec and prior graph building.
# In each case, the 1st element of debug_tensor_value is the ID of the