[tfdbg2] Various improvements to DebugDataReader for DebuggerV2
This is related to https://github.com/tensorflow/tensorboard/pull/3564

1. Add DebuggedGraph.get_op_creation_digest().
2. Remove DebuggedGraph.get_op_type(), which is superseded by
   DebuggedGraph.get_op_creation_digest() and is not used anywhere.
3. Add DebuggedGraph.add_op_consumer() and DebuggedGraph.get_op_consumers()
   to enable efficient tracking of the downstream consuming ops of a graph op.
4. Add host_name and stack_frame_ids to the data class GraphOpCreationDigest.

PiperOrigin-RevId: 309455936
Change-Id: I104084c1ef8b887f69733702a2f4c3190fa5402f
parent 6232075ca8
commit 958fbebe70
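At a glance, the new surface area reads as follows. This is a minimal usage sketch, not code from this commit; the dump path, graph ID, and op name are hypothetical placeholders:

    from tensorflow.python.debug.lib import debug_events_reader

    # "/tmp/tfdbg2_dump" and "some_graph_id" are hypothetical.
    with debug_events_reader.DebugDataReader("/tmp/tfdbg2_dump") as reader:
      reader.update()  # Incrementally load the debug-event files.
      graph = reader.graph_by_id("some_graph_id")
      # get_op_creation_digest() supersedes the removed get_op_type().
      digest = graph.get_op_creation_digest("my_op")  # hypothetical op name
      print(digest.op_type, digest.host_name, digest.stack_frame_ids)
      # Each consumer record is a (src_slot, dst_op_name, dst_slot) tuple.
      for src_slot, dst_op_name, dst_slot in graph.get_op_consumers("my_op"):
        print("output slot %d feeds %s at input slot %d"
              % (src_slot, dst_op_name, dst_slot))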
@@ -410,6 +410,8 @@ class DebuggedGraph(object):
     self._inner_graph_ids = []
     # A dictionary from op name to GraphOpCreationDigest.
     self._op_by_name = dict()
+    # A dictionary mapping op to immediate downstream consumers.
+    self._op_consumers = collections.defaultdict(list)
 
   def add_inner_graph_id(self, inner_graph_id):
     """Add the debugger-generated ID of a graph nested within this graph.
@@ -434,6 +436,18 @@ class DebuggedGraph(object):
     self._op_by_name[
         graph_op_creation_digest.op_name] = graph_op_creation_digest
 
+  def add_op_consumer(self, src_op_name, src_slot, dst_op_name, dst_slot):
+    """Add a consuming op for this op.
+
+    Args:
+      src_op_name: Name of the op of which the output tensor is being consumed.
+      src_slot: 0-based output slot of the op being consumed.
+      dst_op_name: Name of the consuming op (e.g., "Conv2D_3/BiasAdd")
+      dst_slot: 0-based input slot of the consuming op that receives the tensor
+        from this op.
+    """
+    self._op_consumers[src_op_name].append((src_slot, dst_op_name, dst_slot))
+
   @property
   def name(self):
     return self._name
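The consumer map recorded above is a collections.defaultdict(list) keyed by the producing op's name, so querying an op with no recorded consumers yields an empty list rather than a KeyError. A toy illustration with invented op names:

    import collections

    op_consumers = collections.defaultdict(list)
    # Output slot 0 of "AddV2_1" feeds input slot 0 of "Log_1".
    op_consumers["AddV2_1"].append((0, "Log_1", 0))
    print(op_consumers["AddV2_1"])   # [(0, 'Log_1', 0)]
    print(op_consumers["NoSuchOp"])  # [] -- no KeyError for unseen producers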
@@ -450,13 +464,33 @@ class DebuggedGraph(object):
   def inner_graph_ids(self):
     return self._inner_graph_ids
 
-  def get_op_type(self, op_name):
-    return self._op_by_name[op_name].op_type
-
   def get_tensor_id(self, op_name, output_slot):
     """Get the ID of a symbolic tensor in this graph."""
     return self._op_by_name[op_name].output_tensor_ids[output_slot]
 
+  def get_op_creation_digest(self, op_name):
+    """Get the GraphOpCreationDigest for an op in the graph."""
+    return self._op_by_name[op_name]
+
+  def get_op_consumers(self, src_op_name):
+    """Get all the downstream consumers of this op.
+
+    Only data (non-control) edges are tracked.
+
+    Args:
+      src_op_name: Name of the op providing the tensor being consumed.
+
+    Returns:
+      A list of (src_slot, dst_op_name, dst_slot) tuples. In each item of
+      the list:
+        src_slot: 0-based output slot of the op of which the output tensor
+          is being consumed.
+        dst_op_name: Name of the consuming op (e.g., "Conv2D_3/BiasAdd")
+        dst_slot: 0-based input slot of the consuming op that receives
+          the tensor from this op.
+    """
+    return self._op_consumers[src_op_name]
+
   # TODO(cais): Implement to_json().
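get_op_creation_digest() generalizes the removed get_op_type(): callers that only need the op type read it off the returned digest, which now also carries host_name and stack_frame_ids. Continuing the usage sketch from the top of this page, where `graph` is a DebuggedGraph obtained from reader.graph_by_id() and "my_op" is a hypothetical op name:

    # Before this change:  op_type = graph.get_op_type("my_op")
    # After this change:
    digest = graph.get_op_creation_digest("my_op")
    op_type = digest.op_type
    # New in this change: returns [] if nothing consumes the op's outputs.
    consumers = graph.get_op_consumers("my_op")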
@@ -500,6 +534,9 @@ class GraphOpCreationDigest(BaseDigest):
     output_tensor_ids: Debugger-generated IDs for the output(s) of the op.
     input_names: Names of the input tensors to the op.
     device_name: The name of the device that the op is placed on (if available).
+    host_name: Name of the host on which the op is created.
+    stack_frame_ids: IDs of the frames of the stack trace at which the op
+      is created.
   """
 
   def __init__(self,
@@ -509,6 +546,8 @@ class GraphOpCreationDigest(BaseDigest):
                op_type,
                op_name,
                output_tensor_ids,
+               host_name,
+               stack_frame_ids,
                input_names=None,
                device_name=None):
     super(GraphOpCreationDigest, self).__init__(wall_time, offset)
@@ -516,6 +555,8 @@ class GraphOpCreationDigest(BaseDigest):
     self._op_type = op_type
     self._op_name = op_name
     self._output_tensor_ids = _tuple_or_none(output_tensor_ids)
+    self._host_name = host_name
+    self._stack_frame_ids = stack_frame_ids
     self._input_names = _tuple_or_none(input_names)
     self._device_name = device_name
 
@@ -547,6 +588,14 @@ class GraphOpCreationDigest(BaseDigest):
   def device_name(self):
     return self._device_name
 
+  @property
+  def host_name(self):
+    return self._host_name
+
+  @property
+  def stack_frame_ids(self):
+    return self._stack_frame_ids
+
   def to_json(self):
     output = super(GraphOpCreationDigest, self).to_json()
     output.update({
@@ -554,6 +603,8 @@ class GraphOpCreationDigest(BaseDigest):
         "op_type": self.op_type,
         "op_name": self.op_name,
         "output_tensor_ids": self.output_tensor_ids,
+        "host_name": self.host_name,
+        "stack_frame_ids": self.stack_frame_ids,
         "input_names": self.input_names,
         "device_name": self.device_name,
     })
@@ -849,9 +900,17 @@ class DebugDataReader(object):
             op_creation_proto.op_type,
             op_creation_proto.op_name,
             tuple(op_creation_proto.output_tensor_ids),
+            op_creation_proto.code_location.host_name,
+            tuple(op_creation_proto.code_location.stack_frame_ids),
             input_names=tuple(op_creation_proto.input_names))
         self._graph_op_digests.append(op_digest)
-        self._graph_by_id[op_creation_proto.graph_id].add_op(op_digest)
+        debugged_graph = self._graph_by_id[op_creation_proto.graph_id]
+        debugged_graph.add_op(op_digest)
+        for dst_slot, input_name in enumerate(op_creation_proto.input_names):
+          src_op_name, src_slot = input_name.split(":")
+          debugged_graph.add_op_consumer(src_op_name, int(src_slot),
+                                         op_creation_proto.op_name, dst_slot)
       elif debug_event.debugged_graph.ByteSize():
         graph_proto = debug_event.debugged_graph
         graph = DebuggedGraph(
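The new loop above leans on TensorFlow's convention that each entry of input_names has the form "<op_name>:<output_slot>", so a single split(":") recovers the producer op and its output slot, while enumerate() supplies the consumer's input slot. A standalone sketch with invented tensor names:

    # Each input name follows the "<op_name>:<output_slot>" convention.
    input_names = ["Model_1/Add:0", "Model_1/Unique:1"]
    for dst_slot, input_name in enumerate(input_names):
      src_op_name, src_slot = input_name.split(":")
      # e.g., output slot 0 of "Model_1/Add" feeds this op's input slot 0.
      print((src_op_name, int(src_slot)), "-> input slot", dst_slot)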
@@ -936,7 +995,7 @@ class DebugDataReader(object):
     Returns:
       Op type as a str.
     """
-    return self._graph_by_id[graph_id].get_op_type(op_name)
+    return self._graph_by_id[graph_id].get_op_creation_digest(op_name).op_type
 
   def _load_execution(self):
     """Incrementally read the .execution file."""
@@ -1136,13 +1195,10 @@ class DebugDataReader(object):
       1. The host name.
       2. The stack trace, as a list of (file_path, lineno, func) tuples.
     """
-    debug_event = self._reader.read_graphs_event(
-        graph_op_creation_digest.offset)
-    graph_op_creation = debug_event.graph_op_creation
-    host_name = graph_op_creation.code_location.host_name
-    return host_name, [
+    return graph_op_creation_digest.host_name, [
         self._stack_frame_by_id[frame_id][1:]
-        for frame_id in graph_op_creation.code_location.stack_frame_ids]
+        for frame_id in graph_op_creation_digest.stack_frame_ids
+    ]
 
   # TODO(cais): Add graph_execution_digests() with an ExecutionDigest
   # as a kwarg, to establish the association between top-level and intra-graph
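Because the digest now carries host_name and stack_frame_ids (populated in the hunk at -849 above), resolving a stack trace no longer re-reads the .graphs file; it becomes a pure in-memory lookup. A hypothetical standalone equivalent, assuming (as the [1:] slice suggests) that _stack_frame_by_id maps a frame ID to a (host_name, file_path, lineno, func) tuple; the frame IDs and locations below are invented:

    import collections

    Digest = collections.namedtuple("Digest", ["host_name", "stack_frame_ids"])

    stack_frame_by_id = {
        "f1": ("worker0", "model.py", 42, "build"),
        "f2": ("worker0", "train.py", 7, "main"),
    }

    def read_stack_trace(digest):
      # Drop the leading host_name via [1:], keeping (file_path, lineno, func).
      return digest.host_name, [
          stack_frame_by_id[frame_id][1:] for frame_id in digest.stack_frame_ids
      ]

    print(read_stack_trace(Digest("worker0", ["f1", "f2"])))
    # ('worker0', [('model.py', 42, 'build'), ('train.py', 7, 'main')])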
@@ -662,8 +662,14 @@ class DataObjectsTest(test_util.TensorFlowTestCase):
 
   def testGraphOpCreationDigestNoInputNoDeviceNameToJson(self):
     op_creation_digest = debug_events_reader.GraphOpCreationDigest(
-        1234, 5678, "deadbeef", "FooOp", "Model_1/Foo_2",
-        [135], input_names=None, device_name=None)
+        1234,
+        5678,
+        "deadbeef",
+        "FooOp",
+        "Model_1/Foo_2", [135],
+        "machine.cluster", ("a1", "a2"),
+        input_names=None,
+        device_name=None)
     json = op_creation_digest.to_json()
     self.jsonRoundTripCheck(json)
     self.assertEqual(json["wall_time"], 1234)
@@ -671,13 +677,21 @@ class DataObjectsTest(test_util.TensorFlowTestCase):
     self.assertEqual(json["op_type"], "FooOp")
     self.assertEqual(json["op_name"], "Model_1/Foo_2")
     self.assertEqual(json["output_tensor_ids"], (135,))
+    self.assertEqual(json["host_name"], "machine.cluster")
+    self.assertEqual(json["stack_frame_ids"], ("a1", "a2"))
     self.assertIsNone(json["input_names"])
     self.assertIsNone(json["device_name"])
 
   def testGraphOpCreationDigestWithInputsAndDeviceNameToJson(self):
     op_creation_digest = debug_events_reader.GraphOpCreationDigest(
-        1234, 5678, "deadbeef", "FooOp", "Model_1/Foo_2",
-        [135], input_names=["Bar_1", "Qux_2"], device_name="/device:GPU:0")
+        1234,
+        5678,
+        "deadbeef",
+        "FooOp",
+        "Model_1/Foo_2", [135],
+        "machine.cluster", ("a1", "a2"),
+        input_names=["Bar_1", "Qux_2"],
+        device_name="/device:GPU:0")
     json = op_creation_digest.to_json()
     self.jsonRoundTripCheck(json)
     self.assertEqual(json["wall_time"], 1234)
@@ -685,6 +699,8 @@ class DataObjectsTest(test_util.TensorFlowTestCase):
     self.assertEqual(json["op_type"], "FooOp")
     self.assertEqual(json["op_name"], "Model_1/Foo_2")
     self.assertEqual(json["output_tensor_ids"], (135,))
+    self.assertEqual(json["host_name"], "machine.cluster")
+    self.assertEqual(json["stack_frame_ids"], ("a1", "a2"))
     self.assertEqual(json["input_names"], ("Bar_1", "Qux_2"))
     self.assertEqual(json["device_name"], "/device:GPU:0")
 
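The two tests above pin down the serialized shape of the enriched digest. Restricted to the keys they actually assert (BaseDigest may contribute further keys), the to_json() payload of the first test case is expected to contain:

    expected_subset = {
        "wall_time": 1234,
        "op_type": "FooOp",
        "op_name": "Model_1/Foo_2",
        "output_tensor_ids": (135,),
        "host_name": "machine.cluster",
        "stack_frame_ids": ("a1", "a2"),
        "input_names": None,
        "device_name": None,
    }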
@@ -756,6 +756,63 @@ class DumpingCallbackTest(
         non_placeholder_full_tensor_values[3],
         np.sin(np.log(5.0) + 1.0))  # Sin op.
 
+  @parameterized.named_parameters(
+      ("NoTensor", "NO_TENSOR"),
+      ("FullTensor", "FULL_TENSOR"),
+  )
+  @test_util.run_in_graph_and_eager_modes
+  def testGraphOpConsumingRelationIsCaptured(self, tensor_debug_mode):
+    writer = dumping_callback.enable_dump_debug_info(
+        self.dump_root, tensor_debug_mode=tensor_debug_mode)
+
+    @def_function.function
+    def log_sum(x, y):
+      return math_ops.log(x + y)
+
+    @def_function.function
+    def maxindex_sin1p_log_sum(x, y):
+      _, indices = array_ops.unique(math_ops.sin(1.0 + log_sum(x, y)))
+      return math_ops.reduce_max(indices)
+
+    x = constant_op.constant([2.0, 2.0])
+    y = constant_op.constant([3.0, 3.0])
+    maxindex = maxindex_sin1p_log_sum(x, y)
+    self.assertAllEqual(maxindex, 0)
+    writer.FlushNonExecutionFiles()
+    writer.FlushExecutionFiles()
+
+    with debug_events_reader.DebugDataReader(self.dump_root) as reader:
+      reader.update()
+      traces = reader.graph_execution_traces()
+      add_traces = [trace for trace in traces if trace.op_type == "AddV2"]
+      log_traces = [trace for trace in traces if trace.op_type == "Log"]
+      sin_traces = [trace for trace in traces if trace.op_type == "Sin"]
+      unique_traces = [trace for trace in traces if trace.op_type == "Unique"]
+      max_traces = [trace for trace in traces if trace.op_type == "Max"]
+      self.assertLen(add_traces, 2)
+      self.assertLen(log_traces, 1)
+      self.assertLen(sin_traces, 1)
+      self.assertLen(unique_traces, 2)  # The Unique op outputs two tensors.
+      self.assertLen(max_traces, 1)
+      graph = reader.graph_by_id(add_traces[0].graph_id)
+      # The first AddV2 op is consumed by the Log op.
+      self.assertEqual(
+          graph.get_op_consumers(add_traces[0].op_name),
+          [(0, log_traces[0].op_name, 0)])
+      graph = reader.graph_by_id(add_traces[1].graph_id)
+      # The second AddV2 op is consumed by the Sin op.
+      self.assertEqual(
+          graph.get_op_consumers(add_traces[1].op_name),
+          [(0, sin_traces[0].op_name, 0)])
+      # The last Sin op is consumed by the Unique op.
+      self.assertEqual(
+          graph.get_op_consumers(sin_traces[0].op_name),
+          [(0, unique_traces[0].op_name, 0)])
+      # The Unique op's 2nd output tensor is consumed by the Max op.
+      self.assertEqual(
+          graph.get_op_consumers(unique_traces[0].op_name),
+          [(1, max_traces[0].op_name, 0)])
+
   def testCapturingExecutedGraphIdsOfTwoCompilationsOfSameFunction(self):
     """Test correct executed IDs of two FuncGraphs from the same Py function."""
     writer = dumping_callback.enable_dump_debug_info(
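For readers tracing the assertions in testGraphOpConsumingRelationIsCaptured, the dataflow the test builds and then checks is, schematically (the first AddV2 lives in log_sum's inner FuncGraph, the second in the outer one, which is why the test fetches two graphs):

    # x + y            -> AddV2  (slot 0 feeds Log at input slot 0)
    # log(x + y)       -> Log
    # 1.0 + log_sum(.) -> AddV2  (slot 0 feeds Sin at input slot 0)
    # sin(...)         -> Sin    (slot 0 feeds Unique at input slot 0)
    # unique(...)      -> Unique (two outputs; output slot 1 feeds Max)
    # reduce_max(...)  -> Max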