[tfdbg2] Add FULL_HEALTH tensor debug mode support to InfNanMonitor

PiperOrigin-RevId: 299093247 Change-Id: I0f5e1f724e231cf3ba3ffb35c805b967f176ee7e
2020-03-05 06:57:40 -08:00 · 2020-03-05 06:57:40 -08:00 · 5536be153f
commit 5536be153f
parent 07c6281991
2 changed files with 60 additions and 6 deletions
--- a/tensorflow/python/debug/lib/debug_events_monitors.py
+++ b/tensorflow/python/debug/lib/debug_events_monitors.py
@@ -228,8 +228,23 @@ class InfNanMonitor(BaseMonitor):
            execution_index=execution_index,
            graph_execution_trace_index=graph_execution_trace_index))
    elif tensor_debug_mode == debug_event_pb2.TensorDebugMode.FULL_HEALTH:
-      raise NotImplementedError(
-          "InfNanMonitor does not support FULL_HEALTH tensor-debug mode yet.")
+      (_, _, _, _, size, num_neg_inf, num_pos_inf, num_nan,
+       _, _, _) = debug_tensor_value
+      if num_neg_inf or num_pos_inf or num_nan:
+        self._alerts.append(InfNanAlert(
+            wall_time,
+            op_type,
+            output_slot,
+            size=size,
+            num_neg_inf=num_neg_inf,
+            num_pos_inf=num_pos_inf,
+            num_nan=num_nan,
+            execution_index=execution_index,
+            graph_execution_trace_index=graph_execution_trace_index))
+    else:
+      raise ValueError(
+          "Unsupported tensor debug mode: %s" %
+          debug_event_pb2.TensorDebugMode.Name(tensor_debug_mode))

  def on_execution(self,
                   execution_index,
--- a/tensorflow/python/debug/lib/debug_events_monitors_test.py
+++ b/tensorflow/python/debug/lib/debug_events_monitors_test.py
@@ -66,6 +66,7 @@ class DebugEventsMonitorTest(dumping_callback_test_lib.DumpingCallbackTestBase,
  @parameterized.named_parameters(
      ("NoTensor", "NO_TENSOR"),
      ("ConciseHealth", "CONCISE_HEALTH"),
+      ("FullHealth", "FULL_HEALTH"),
      ("FullTensor", "FULL_TENSOR"),
  )
  def testOnExecutionIsCalled(self, tensor_debug_mode):
@@ -96,6 +97,12 @@ class DebugEventsMonitorTest(dumping_callback_test_lib.DumpingCallbackTestBase,
        self.assertLen(execution.debug_tensor_values, 1)
        # [tensor_id, element_count, neg_inf_count, pos_inf_count, nan_count].
        self.assertLen(execution.debug_tensor_values[0], 5)
+      elif tensor_debug_mode == "FULL_HEALTH":
+        self.assertLen(execution.debug_tensor_values, 1)
+        # [tensor_id, device_id, dtype, rank, element_count,
+        #  neg_inf_count, pos_inf_count, nan_count,
+        #  neg_finite_count, zero_count, pos_finite_count].
+        self.assertLen(execution.debug_tensor_values[0], 11)
      elif tensor_debug_mode == "FULL_TENSOR":
        # Full tensor values are not stored in the debug_tensor_values field.
        self.assertIsNone(execution.debug_tensor_values)
@@ -104,6 +111,7 @@ class DebugEventsMonitorTest(dumping_callback_test_lib.DumpingCallbackTestBase,

  @parameterized.named_parameters(
      ("ConciseHealth", "CONCISE_HEALTH"),
+      ("FullHealth", "FULL_HEALTH"),
      ("FullTensor", "FULL_TENSOR"),
  )
  def testOnGraphExecutionTraceIsCalled(self, tensor_debug_mode):
@@ -149,6 +157,21 @@ class DebugEventsMonitorTest(dumping_callback_test_lib.DumpingCallbackTestBase,
        self.assertLen(traces[0].debug_tensor_value, 5)
        self.assertLen(traces[1].debug_tensor_value, 5)
        self.assertLen(traces[2].debug_tensor_value, 5)
+      elif tensor_debug_mode == "FULL_HEALTH":
+        self.assertLen(traces, 3)  # [Placeholder:0, Unique:0 , Sum:0].
+        self.assertEqual(traces[0].op_type, "Placeholder")
+        self.assertEqual(traces[0].output_slot, 0)
+        self.assertEqual(traces[1].op_type, "Unique")
+        self.assertEqual(traces[1].output_slot, 0)
+        # Unique:1 is not traced under FULL_HEALTH mode, as it's int-dtype.
+        self.assertEqual(traces[2].op_type, "Sum")
+        self.assertEqual(traces[2].output_slot, 0)
+        # [tensor_id, device_id, dtype, rank, element_count,
+        #  neg_inf_count, pos_inf_count, nan_count,
+        #  neg_finite_count, zero_count, pos_finite_count].
+        self.assertLen(traces[0].debug_tensor_value, 11)
+        self.assertLen(traces[1].debug_tensor_value, 11)
+        self.assertLen(traces[2].debug_tensor_value, 11)
      elif tensor_debug_mode == "FULL_TENSOR":
        self.assertLen(traces, 4)  # [Placeholder:0, Unique:0, Unique:1, Sum:0].
        self.assertEqual(traces[0].op_type, "Placeholder")
@@ -236,21 +259,37 @@ class InfNanMonitorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
    self.assertEqual(alert.execution_index, 50)
    self.assertIsNone(alert.graph_execution_trace_index)

-  def testInfNanMonitorOnExecutionUnderConciseHealthMode(self):
+  @parameterized.named_parameters(
+      ("ConciseHealth",
+       debug_event_pb2.TensorDebugMode.CONCISE_HEALTH,
+       # [tensor_id, size, num_neg_inf, num_pos_inf, num_nan].
+       [[-1, 10, 1, 2, 3],
+        [-1, 100, 0, 0, 0]]),
+      ("FullHealth",
+       debug_event_pb2.TensorDebugMode.FULL_HEALTH,
+       # [tensor_id, device_id, dtype, rank, element_count,
+       #  neg_inf_count, pos_inf_count, nan_count,
+       #  neg_finite_count, zero_count, pos_finite_count].
+       [[-1, -1, 1, 1, 10, 1, 2, 3, 0, 0, 0],
+        [-1, -1, 1, 1, 100, 0, 0, 0, 10, 30, 60]]),
+  )
+  def testInfNanMonitorOnExecutionUnderHealthMode(self,
+                                                  tensor_debug_mode,
+                                                  debug_tensor_values):
    mock_reader = test.mock.MagicMock()
    monitor = debug_events_monitors.InfNanMonitor(mock_reader)
    execution_digest = debug_events_reader.ExecutionDigest(
        1234, 1, "BarOp", output_tensor_device_ids=[0, 1])
+
    execution = debug_events_reader.Execution(
        execution_digest,
        "worker01",
        ["a1", "b2", "e3"],
-        debug_event_pb2.TensorDebugMode.CONCISE_HEALTH,
+        tensor_debug_mode,
        graph_id=None,
        input_tensor_ids=[12, 34],
        output_tensor_ids=[56, 78],
-        # [tensor_id, size, num_neg_inf, num_pos_inf, num_nan].
-        debug_tensor_values=[[-1, 10, 1, 2, 3], [-1, 100, 0, 0, 0]])
+        debug_tensor_values=debug_tensor_values)
    monitor.on_execution(60, execution)

    self.assertLen(monitor.alerts(), 1)