[tfdbg] Implement some to_json() methods & miscellaneous changes

Background:
- Most of these additions are missing pieces of the `DebugDataReader`
  class that were discovered during the development of the tfdbg2 UIs
  (CLI and web GUI).

- This CL implements the to_json() method of the following data classes,
  thereby resolving the related TODO items (a brief usage sketch follows
  this list).
  - BaseDigest
  - ExecutionDigest
  - Execution
  - GraphOpCreationDigest
  - GraphExecutionTraceDigest
  - GraphExecutionTrace
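
  A minimal sketch of the round trip, assuming a TensorFlow build that
  includes this change (the constructor arguments mirror the new unit tests
  below; the concrete values are placeholders):

    import json

    from tensorflow.python.debug.lib import debug_events_reader

    # to_json() returns a plain dict of JSON-friendly values; the tuples it
    # contains serialize as JSON lists via the standard json module.
    digest = debug_events_reader.ExecutionDigest(
        1234, 5678, "FooOp", output_tensor_device_ids=[1357, 2468])
    print(json.dumps(digest.to_json(), sort_keys=True))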

Other changes:
- Add the `host_name` field to `Execution`.
- Add a `source_file_list()` method to `DebugDataReader`, which returns a
  tuple of `(host_name, file_path)` tuples for all source files involved in
  the execution of the program.
- In the `debug_tensor_value` property of `GraphExecutionTrace`, return
  `None` instead of an empty list when no data is available.
- Add a `graph_id` field to `GraphExecutionTraceDigest`.
- Change the `device_names()` method of `DebugDataReader` to
  `device_name_map()`, which returns a dict mapping device IDs to device
  names. This enables mapping `output_tensor_device_ids` in
  `ExecutionDigest` objects to actual device names (a usage sketch follows
  this list).
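
  A hedged sketch of the reader-level additions (the dump-root path is a
  placeholder; the calls mirror the new unit tests):

    from tensorflow.python.debug.lib import debug_events_reader

    dump_root = "/tmp/tfdbg2_dump_dir"  # hypothetical dump directory

    with debug_events_reader.DebugDataReader(dump_root) as reader:
      reader.update()
      # Tuple of (host_name, file_path) tuples for all known source files.
      source_files = reader.source_file_list()
      # Dict mapping debugger-generated device IDs to device names; useful
      # for resolving ExecutionDigest.output_tensor_device_ids.
      device_map = reader.device_name_map()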

PiperOrigin-RevId: 289167538
Change-Id: Ie79c736e068974649281e3d1756aacabd0ce6345
Shanqing Cai 2020-01-10 14:36:56 -08:00 committed by TensorFlower Gardener
parent 459c5cb980
commit 8fce32ec67
4 changed files with 277 additions and 30 deletions

View File

@ -197,7 +197,7 @@ class BaseDigest(object):
"""Base class for digest.
Properties:
wall_time: A timestamp for the digest (unit: s).
wall_time: A timestamp for the digest as a `float` (unit: s).
offset: An offset number in the corresponding file that can be used for
fast random read access.
"""
@ -214,6 +214,9 @@ class BaseDigest(object):
def offset(self):
return self._offset
def to_json(self):
return {"wall_time": self.wall_time}
class ExecutionDigest(BaseDigest):
"""Light-weight digest summarizing top-level execution event.
@ -238,7 +241,7 @@ class ExecutionDigest(BaseDigest):
output_tensor_device_ids=None):
super(ExecutionDigest, self).__init__(wall_time, offset)
self._op_type = op_type
self._output_tensor_device_ids = output_tensor_device_ids
self._output_tensor_device_ids = _tuple_or_none(output_tensor_device_ids)
@property
def op_type(self):
@ -248,7 +251,17 @@ class ExecutionDigest(BaseDigest):
def output_tensor_device_ids(self):
return self._output_tensor_device_ids
# TODO(cais): Implement to_json().
def to_json(self):
output = super(ExecutionDigest, self).to_json()
output.update({
"op_type": self.op_type,
"output_tensor_device_ids": self.output_tensor_device_ids,
})
return output
def _tuple_or_none(data):
return tuple(data) if data else None
class Execution(ExecutionDigest):
@ -258,6 +271,7 @@ class Execution(ExecutionDigest):
number of output tensors.
Properties (beyond the base class `ExecutionDigest`):
host_name: Name of the host on which the execution happened.
stack_frame_ids: Reference IDs for stack frames, ordered from bottommost to
topmost. Use `DebugDataReader.read_execution_stack_trace()` to load the
detailed stack frames (filepath, lineno and function name).
@ -277,6 +291,7 @@ class Execution(ExecutionDigest):
def __init__(self,
execution_digest,
host_name,
stack_frame_ids,
tensor_debug_mode,
graph_id=None,
@ -288,12 +303,17 @@ class Execution(ExecutionDigest):
execution_digest.offset,
execution_digest.op_type,
output_tensor_device_ids=execution_digest.output_tensor_device_ids)
self._stack_frame_ids = stack_frame_ids
self._host_name = host_name
self._stack_frame_ids = tuple(stack_frame_ids)
self._tensor_debug_mode = tensor_debug_mode
self._graph_id = graph_id
self._input_tensor_ids = input_tensor_ids
self._output_tensor_ids = output_tensor_ids
self._debug_tensor_values = debug_tensor_values
self._input_tensor_ids = _tuple_or_none(input_tensor_ids)
self._output_tensor_ids = _tuple_or_none(output_tensor_ids)
self._debug_tensor_values = _tuple_or_none(debug_tensor_values)
@property
def host_name(self):
return self._host_name
@property
def stack_frame_ids(self):
@ -323,7 +343,18 @@ class Execution(ExecutionDigest):
def debug_tensor_values(self):
return self._debug_tensor_values
# TODO(cais): Implement to_json().
def to_json(self):
output = super(Execution, self).to_json()
output.update({
"host_name": self.host_name,
"stack_frame_ids": self.stack_frame_ids,
"tensor_debug_mode": self.tensor_debug_mode,
"graph_id": self.graph_id,
"input_tensor_ids": self.input_tensor_ids,
"output_tensor_ids": self.output_tensor_ids,
"debug_tensor_values": self.debug_tensor_values,
})
return output
class DebuggedGraph(object):
@ -452,8 +483,8 @@ class GraphOpCreationDigest(BaseDigest):
self._graph_id = graph_id
self._op_type = op_type
self._op_name = op_name
self._output_tensor_ids = output_tensor_ids
self._input_names = input_names
self._output_tensor_ids = _tuple_or_none(output_tensor_ids)
self._input_names = _tuple_or_none(input_names)
self._device_name = device_name
@property
@ -484,7 +515,17 @@ class GraphOpCreationDigest(BaseDigest):
def device_name(self):
return self._device_name
# TODO(cais): Implement to_json().
def to_json(self):
output = super(GraphOpCreationDigest, self).to_json()
output.update({
"graph_id": self.graph_id,
"op_type": self.op_type,
"op_name": self.op_name,
"output_tensor_ids": self.output_tensor_ids,
"input_names": self.input_names,
"device_name": self.device_name,
})
return output
class GraphExecutionTraceDigest(BaseDigest):
@ -497,6 +538,8 @@ class GraphExecutionTraceDigest(BaseDigest):
op_type: Type name of the executed op (e.g., "Conv2D").
op_name: Name of the op (e.g., "conv_2d_3/Conv2D").
output_slot: Output slot index of the tensor.
graph_id: The debugger-generated ID of the innermost (immediately-enclosing)
graph.
"""
def __init__(self,
@ -504,11 +547,13 @@ class GraphExecutionTraceDigest(BaseDigest):
offset,
op_type,
op_name,
output_slot):
output_slot,
graph_id):
super(GraphExecutionTraceDigest, self).__init__(wall_time, offset)
self._op_type = op_type
self._op_name = op_name
self._output_slot = output_slot
self._graph_id = graph_id
@property
def op_type(self):
@ -522,7 +567,19 @@ class GraphExecutionTraceDigest(BaseDigest):
def output_slot(self):
return self._output_slot
# TODO(cais): Implement to_json().
@property
def graph_id(self):
return self._graph_id
def to_json(self):
output = super(GraphExecutionTraceDigest, self).to_json()
output.update({
"op_type": self.op_type,
"op_name": self.op_name,
"output_slot": self.output_slot,
"graph_id": self.graph_id,
})
return output
class GraphExecutionTrace(GraphExecutionTraceDigest):
@ -551,8 +608,9 @@ class GraphExecutionTrace(GraphExecutionTraceDigest):
graph_execution_trace_digest.offset,
graph_execution_trace_digest.op_type,
graph_execution_trace_digest.op_name,
graph_execution_trace_digest.output_slot)
self._graph_ids = graph_ids
graph_execution_trace_digest.output_slot,
graph_execution_trace_digest.graph_id)
self._graph_ids = tuple(graph_ids)
self._tensor_debug_mode = tensor_debug_mode
self._debug_tensor_value = debug_tensor_value
self._device_name = device_name
@ -571,13 +629,21 @@ class GraphExecutionTrace(GraphExecutionTraceDigest):
@property
def debug_tensor_value(self):
return self._debug_tensor_value
return _tuple_or_none(self._debug_tensor_value)
@property
def device_name(self):
return self._device_name
# TODO(cais): Implement to_json().
def to_json(self):
output = super(GraphExecutionTrace, self).to_json()
output.update({
"graph_ids": self.graph_ids,
"tensor_debug_mode": self.tensor_debug_mode,
"debug_tensor_value": self.debug_tensor_value,
"device_name": self.device_name,
})
return output
def _parse_tensor_value(tensor_proto, return_list=False):
@ -740,7 +806,8 @@ class DebugDataReader(object):
offset,
op_type,
op_name,
trace_proto.output_slot)
trace_proto.output_slot,
debug_event.graph_execution_trace.tfdbg_context_id)
self._graph_execution_trace_digests.append(digest)
def _lookup_op_type(self, graph_id, op_name):
@ -774,6 +841,14 @@ class DebugDataReader(object):
self._load_graph_execution_traces()
self._load_execution()
def source_file_list(self):
"""Get a list of source files known to the debugger data reader.
Returns:
A tuple of `(host_name, file_path)` tuples.
"""
return tuple(self._host_name_file_path_to_offset.keys())
def source_lines(self, host_name, file_path):
"""Read the line-by-line content of a source file.
@ -819,9 +894,10 @@ class DebugDataReader(object):
"""Get the name of a device by the debugger-generated ID of the device."""
return self._device_by_id[device_id].device_name
def device_names(self):
"""Get a set of all device names known to the debugger."""
return set(device.device_name for device in self._device_by_id.values())
def device_name_map(self):
"""Get a map mapping device IDs to device names."""
return {device_id: self._device_by_id[device_id].device_name
for device_id in self._device_by_id}
def graph_op_digests(self, op_type=None):
"""Get the list of the digests for graph-op creation so far.
@ -904,13 +980,13 @@ class DebugDataReader(object):
_parse_tensor_value(tensor_proto, return_list=True))
return Execution(
execution_digest,
execution_proto.code_location.host_name,
tuple(execution_proto.code_location.stack_frame_ids),
execution_proto.tensor_debug_mode,
graph_id=execution_proto.graph_id,
input_tensor_ids=tuple(execution_proto.input_tensor_ids),
output_tensor_ids=tuple(execution_proto.output_tensor_ids),
debug_tensor_values=tuple(
debug_tensor_values) if debug_tensor_values else None)
debug_tensor_values=_tuple_or_none(debug_tensor_values))
def read_graph_execution_trace(self, graph_execution_trace_digest):
"""Read the detailed graph execution trace.
@ -955,9 +1031,8 @@ class DebugDataReader(object):
execution: The Execution object of interest.
Returns:
A tuple consisting of:
1. The host name.
2. The stack trace, as a list of (file_path, lineno, func) tuples.
1. The host name.
2. The stack trace, as a list of (file_path, lineno, func) tuples.
"""
host_name = self._stack_frame_by_id[execution.stack_frame_ids[0]][0]
return (host_name, [

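A minimal sketch of consuming `DebugDataReader.read_execution_stack_trace()`
as documented above, assuming `reader` is an opened and updated
`DebugDataReader` and `execution` is an `Execution` object previously
obtained from it:

    # Returns the host name plus the stack frames as
    # (file_path, lineno, func) tuples.
    host_name, stack_frames = reader.read_execution_stack_trace(execution)
    for file_path, lineno, func in stack_frames:
      print("%s:%d: %s" % (file_path, lineno, func))
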
View File

@ -19,6 +19,7 @@ from __future__ import division
from __future__ import print_function
import glob
import json as json_lib
import os
import threading
import time
@ -28,6 +29,7 @@ from tensorflow.python.debug.lib import debug_events_reader
from tensorflow.python.debug.lib import debug_events_writer
from tensorflow.python.debug.lib import dumping_callback_test_lib
from tensorflow.python.framework import ops
from tensorflow.python.framework import test_util
from tensorflow.python.framework import versions
from tensorflow.python.platform import googletest
@ -340,6 +342,160 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase):
self.assertLen(op_names, len(set(op_names)))
class DataObjectsTest(test_util.TensorFlowTestCase):
def jsonRoundTripCheck(self, obj):
self.assertEqual(
json_lib.dumps(json_lib.loads(json_lib.dumps(obj)), sort_keys=True),
json_lib.dumps(obj, sort_keys=True))
def testExecutionDigestWithNoOutputToJson(self):
execution_digest = debug_events_reader.ExecutionDigest(
1234, 5678, "FooOp", output_tensor_device_ids=None)
json = execution_digest.to_json()
self.jsonRoundTripCheck(json)
self.assertEqual(json["wall_time"], 1234)
self.assertEqual(json["op_type"], "FooOp")
self.assertEqual(json["output_tensor_device_ids"], None)
def testExecutionDigestWithTwoOutputsToJson(self):
execution_digest = debug_events_reader.ExecutionDigest(
1234, 5678, "FooOp", output_tensor_device_ids=[1357, 2468])
json = execution_digest.to_json()
self.jsonRoundTripCheck(json)
self.assertEqual(json["wall_time"], 1234)
self.assertEqual(json["op_type"], "FooOp")
self.assertEqual(json["output_tensor_device_ids"], (1357, 2468))
def testExecutionNoGraphNoInputToJson(self):
execution_digest = debug_events_reader.ExecutionDigest(
1234, 5678, "FooOp", output_tensor_device_ids=[1357])
execution = debug_events_reader.Execution(
execution_digest,
"localhost",
("a1", "b2"),
debug_event_pb2.TensorDebugMode.CURT_HEALTH,
graph_id=None,
input_tensor_ids=None,
output_tensor_ids=[2468],
debug_tensor_values=([1, 0],))
json = execution.to_json()
self.jsonRoundTripCheck(json)
self.assertEqual(json["wall_time"], 1234)
self.assertEqual(json["op_type"], "FooOp")
self.assertEqual(json["output_tensor_device_ids"], (1357,))
self.assertEqual(json["host_name"], "localhost")
self.assertEqual(json["stack_frame_ids"], ("a1", "b2"))
self.assertEqual(json["tensor_debug_mode"],
debug_event_pb2.TensorDebugMode.CURT_HEALTH)
self.assertIsNone(json["graph_id"])
self.assertIsNone(json["input_tensor_ids"])
self.assertEqual(json["output_tensor_ids"], (2468,))
self.assertEqual(json["debug_tensor_values"], ([1, 0],))
def testExecutionNoGraphNoInputButWithOutputToJson(self):
execution_digest = debug_events_reader.ExecutionDigest(
1234, 5678, "FooOp", output_tensor_device_ids=[1357])
execution = debug_events_reader.Execution(
execution_digest,
"localhost",
("a1", "b2"),
debug_event_pb2.TensorDebugMode.FULL_HEALTH,
graph_id="abcd",
input_tensor_ids=[13, 37],
output_tensor_ids=None,
debug_tensor_values=None)
json = execution.to_json()
self.jsonRoundTripCheck(json)
self.assertEqual(json["wall_time"], 1234)
self.assertEqual(json["op_type"], "FooOp")
self.assertEqual(json["output_tensor_device_ids"], (1357,))
self.assertEqual(json["host_name"], "localhost")
self.assertEqual(json["stack_frame_ids"], ("a1", "b2"))
self.assertEqual(json["tensor_debug_mode"],
debug_event_pb2.TensorDebugMode.FULL_HEALTH)
self.assertEqual(json["graph_id"], "abcd")
self.assertEqual(json["input_tensor_ids"], (13, 37))
self.assertIsNone(json["output_tensor_ids"])
self.assertIsNone(json["debug_tensor_values"])
def testGraphOpCreationDigestNoInputNoDeviceNameToJson(self):
op_creation_digest = debug_events_reader.GraphOpCreationDigest(
1234, 5678, "deadbeef", "FooOp", "Model_1/Foo_2",
[135], input_names=None, device_name=None)
json = op_creation_digest.to_json()
self.jsonRoundTripCheck(json)
self.assertEqual(json["wall_time"], 1234)
self.assertEqual(json["graph_id"], "deadbeef")
self.assertEqual(json["op_type"], "FooOp")
self.assertEqual(json["op_name"], "Model_1/Foo_2")
self.assertEqual(json["output_tensor_ids"], (135,))
self.assertIsNone(json["input_names"])
self.assertIsNone(json["device_name"])
def testGraphOpCreationDigestWithInputsAndDeviceNameToJson(self):
op_creation_digest = debug_events_reader.GraphOpCreationDigest(
1234, 5678, "deadbeef", "FooOp", "Model_1/Foo_2",
[135], input_names=["Bar_1", "Qux_2"], device_name="/device:GPU:0")
json = op_creation_digest.to_json()
self.jsonRoundTripCheck(json)
self.assertEqual(json["wall_time"], 1234)
self.assertEqual(json["graph_id"], "deadbeef")
self.assertEqual(json["op_type"], "FooOp")
self.assertEqual(json["op_name"], "Model_1/Foo_2")
self.assertEqual(json["output_tensor_ids"], (135,))
self.assertEqual(json["input_names"], ("Bar_1", "Qux_2"))
self.assertEqual(json["device_name"], "/device:GPU:0")
def testGraphExecutionTraceDigestToJson(self):
trace_digest = debug_events_reader.GraphExecutionTraceDigest(
1234, 5678, "FooOp", "Model_1/Foo_2", 1, "deadbeef")
json = trace_digest.to_json()
self.assertEqual(json["wall_time"], 1234)
self.assertEqual(json["op_type"], "FooOp")
self.assertEqual(json["op_name"], "Model_1/Foo_2")
self.assertEqual(json["output_slot"], 1)
self.assertEqual(json["graph_id"], "deadbeef")
def testGraphExecutionTraceWithTensorDebugValueAndDeviceNameToJson(self):
trace_digest = debug_events_reader.GraphExecutionTraceDigest(
1234, 5678, "FooOp", "Model_1/Foo_2", 1, "deadbeef")
trace = debug_events_reader.GraphExecutionTrace(
trace_digest, ["g1", "g2", "deadbeef"],
debug_event_pb2.TensorDebugMode.CURT_HEALTH,
debug_tensor_value=[3, 1], device_name="/device:GPU:0")
json = trace.to_json()
self.assertEqual(json["wall_time"], 1234)
self.assertEqual(json["op_type"], "FooOp")
self.assertEqual(json["op_name"], "Model_1/Foo_2")
self.assertEqual(json["output_slot"], 1)
self.assertEqual(json["graph_id"], "deadbeef")
self.assertEqual(json["graph_ids"], ("g1", "g2", "deadbeef"))
self.assertEqual(json["tensor_debug_mode"],
debug_event_pb2.TensorDebugMode.CURT_HEALTH)
self.assertEqual(json["debug_tensor_value"], (3, 1))
self.assertEqual(json["device_name"], "/device:GPU:0")
def testGraphExecutionTraceNoTensorDebugValueNoDeviceNameToJson(self):
trace_digest = debug_events_reader.GraphExecutionTraceDigest(
1234, 5678, "FooOp", "Model_1/Foo_2", 1, "deadbeef")
trace = debug_events_reader.GraphExecutionTrace(
trace_digest, ["g1", "g2", "deadbeef"],
debug_event_pb2.TensorDebugMode.NO_TENSOR,
debug_tensor_value=None, device_name=None)
json = trace.to_json()
self.assertEqual(json["wall_time"], 1234)
self.assertEqual(json["op_type"], "FooOp")
self.assertEqual(json["op_name"], "Model_1/Foo_2")
self.assertEqual(json["output_slot"], 1)
self.assertEqual(json["graph_id"], "deadbeef")
self.assertEqual(json["graph_ids"], ("g1", "g2", "deadbeef"))
self.assertEqual(json["tensor_debug_mode"],
debug_event_pb2.TensorDebugMode.NO_TENSOR)
self.assertIsNone(json["debug_tensor_value"])
self.assertIsNone(json["device_name"])
if __name__ == "__main__":
ops.enable_eager_execution()
googletest.main()

View File

@ -171,7 +171,7 @@ class DistributedDumpingCallbackTest(
if tensor_debug_mode == "NO_TENSOR":
for trace in traces:
self.assertEqual(trace.debug_tensor_value, [])
self.assertIsNone(trace.debug_tensor_value)
elif tensor_debug_mode == "FULL_TENSOR":
device_0_matmul_values = [
reader.graph_execution_trace_to_tensor_value(trace)
@ -273,7 +273,7 @@ class DistributedDumpingCallbackTest(
if tensor_debug_mode == "NO_TENSOR":
for trace in traces:
self.assertEqual(trace.debug_tensor_value, [])
self.assertIsNone(trace.debug_tensor_value)
elif tensor_debug_mode == "FULL_TENSOR":
gpu_0_relu_values = [
reader.graph_execution_trace_to_tensor_value(trace)

View File

@ -342,6 +342,21 @@ class TracingCallbackTest(
self.assertAllClose(
trace.debug_tensor_value, [tensor_id, 10, 2, 4, 2, 2, 0, 0, 0, 0])
def testListingSourceFiles(self):
writer = dumping_callback.enable_dump_debug_info(self.dump_root)
# Run a simple eager execution event, so that the source files are dumped.
self.assertAllClose(math_ops.truediv(7.0, 1.0 / 6.0), 42.0)
writer.FlushNonExecutionFiles()
writer.FlushExecutionFiles()
with debug_events_reader.DebugDataReader(self.dump_root) as reader:
reader.update()
source_file_list = reader.source_file_list()
self.assertIsInstance(source_file_list, tuple)
for item in source_file_list:
self.assertIsInstance(item, tuple)
self.assertLen(item, 2)
self.assertIn((_host_name, _current_file_full_path), source_file_list)
def testReadingSourceLines(self):
writer = dumping_callback.enable_dump_debug_info(self.dump_root)
# Run a simple eager execution event, so that the source-file contents are
@ -405,7 +420,8 @@ class TracingCallbackTest(
self.assertEqual(
reader.device_name_by_id(executions[0].output_tensor_device_ids[0]),
self._expectedDefaultDeviceName())
self.assertIn(self._expectedDefaultDeviceName(), reader.device_names())
self.assertIn(self._expectedDefaultDeviceName(),
set(reader.device_name_map().values()))
# Verify the recorded graph-building history.
add_op_digests = reader.graph_op_digests(op_type="AddV2")
@ -463,7 +479,7 @@ class TracingCallbackTest(
# Under the default NO_TENSOR tensor-debug mode, the tensor_proto ought
# to be an empty float32 tensor.
for trace in graph_exec_traces:
self.assertEqual(trace.debug_tensor_value, [])
self.assertIsNone(trace.debug_tensor_value)
elif tensor_debug_mode == "CURT_HEALTH":
# Test the association between graph exec and prior graph building.
# In each case, the 1st element of debug_tensor_value is the ID of the