[tfdbg2] Fix a bug in which InfNanMonitor does not handle SHAPE mode correctly

Description of bug:
- Introduced by CL/299093247
- The _check_debug_tensor_value() method is intended to check for infs and nans
  if and only if the debug_tensor_value carries data about inf and nan count
  (e.g., FULL_HEALTH and CONCISE_HEALTH). It should ignore tensor_debug_modes
  that do not carry such information in debug_tensor_value (e.g., SHAPE).
  Currently, however, it raises an error for such modes.

Fix:
- Remove the error-throwing if-else branch
- Add a unit test to cover the correct behavior
- Clarify the contract of the _check_debug_tensor_value() method in its docstring.
PiperOrigin-RevId: 300369252
Change-Id: Ib9497ff2154716f524c243ed066cf87a02231786
Authored by Shanqing Cai on 2020-03-11 11:29:21 -07:00; committed by TensorFlower Gardener
parent 8d504919ca
commit ad78070af5
2 changed files with 47 additions and 5 deletions
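
For context, the fixed contract can be summarized with a short standalone sketch. This is illustrative Python only, not the TensorFlow source; the string mode names and the CONCISE_HEALTH summary layout used here are simplifying assumptions:

# Illustrative sketch only; not the TensorFlow implementation. It mirrors the
# fixed behavior: summaries from modes that carry inf/nan counts are
# inspected, while modes such as SHAPE are silently skipped instead of
# raising an error.

# Modes assumed (for this sketch) to carry inf/nan information.
MODES_WITH_INF_NAN_INFO = frozenset(
    ["CURT_HEALTH", "CONCISE_HEALTH", "FULL_HEALTH"])


def check_debug_tensor_value(tensor_debug_mode, debug_tensor_value):
  """Returns (num_neg_inf, num_pos_inf, num_nan), or None if not applicable."""
  if tensor_debug_mode not in MODES_WITH_INF_NAN_INFO:
    return None  # No-op for modes (e.g., SHAPE) with no inf/nan counts.
  if not debug_tensor_value:
    return None
  if tensor_debug_mode == "CONCISE_HEALTH":
    # Assumed layout: [tensor_id, element_count, neg_inf, pos_inf, nan].
    _, _, num_neg_inf, num_pos_inf, num_nan = debug_tensor_value
    if num_neg_inf or num_pos_inf or num_nan:
      return (num_neg_inf, num_pos_inf, num_nan)
  # The other health modes would be handled analogously.
  return None


# A SHAPE-mode summary (like the rows in the new unit test) is simply ignored:
assert check_debug_tensor_value("SHAPE", [-1, 1, 2, 6, 3, 2, 0, 0, 0, 0]) is None
# A CONCISE_HEALTH-style summary with one NaN reports the counts:
assert check_debug_tensor_value("CONCISE_HEALTH", [-1, 8, 0, 0, 1]) == (0, 0, 1)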


@@ -200,7 +200,24 @@ class InfNanMonitor(BaseMonitor):
output_slot,
execution_index=None,
graph_execution_trace_index=None):
"""Check for bad numerical values based on debug summary of tensor value."""
"""Check for bad numerical values based on debug summary of tensor value.
If tensor_debug_mode is one in which debug_tensor_value does not carry
information about the presence or count of inf / nan values (e.g., SHAPE),
this method is a no-op.
When infs and/or nans are found, `InfNanAlert` objects are created and
appended to `self._alerts`.
Args:
tensor_debug_mode: TensorDebugMode proto enum.
debug_tensor_value: Debug tensor value as a list of numbers.
wall_time: Wall timestamp for the tensor event.
op_type: Type of the op that generated the tensor (e.g., "Conv2D").
output_slot: Output slot index of the tensor for the op.
execution_index: Top-level execution index.
graph_execution_trace_index: Intra-graph execution index.
"""
# FULL_TENSOR mode is handled by a separate code path.
assert tensor_debug_mode != debug_event_pb2.TensorDebugMode.FULL_TENSOR
if not debug_tensor_value:
@@ -241,10 +258,6 @@ class InfNanMonitor(BaseMonitor):
num_nan=num_nan,
execution_index=execution_index,
graph_execution_trace_index=graph_execution_trace_index))
else:
raise ValueError(
"Unsupported tensor debug mode: %s" %
debug_event_pb2.TensorDebugMode.Name(tensor_debug_mode))
def on_execution(self,
execution_index,

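The second file adds a parameterized unit test covering SHAPE mode. As a rough, illustrative aid (not part of the commit), the SHAPE-mode summary rows used in that test can be read as follows, assuming the layout given in the test's comment:

# Illustrative decoder (not TensorFlow code) for a SHAPE-mode summary row,
# assuming the layout noted in the test comment: [tensor_id, dtype, rank,
# element_count, shape padded/truncated to 6 slots]. Note that no inf/nan
# counts are present, which is why InfNanMonitor must skip this mode.
def decode_shape_summary(row):
  tensor_id, dtype_enum, rank, element_count = row[:4]
  shape = tuple(row[4:4 + rank])  # Trailing slots are zero padding.
  return {
      "tensor_id": tensor_id,
      "dtype_enum": dtype_enum,
      "rank": rank,
      "element_count": element_count,
      "shape": shape,
  }


# First row from the test: a rank-2 tensor with 6 elements and shape (3, 2).
print(decode_shape_summary([-1, 1, 2, 6, 3, 2, 0, 0, 0, 0]))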

@@ -304,6 +304,35 @@ class InfNanMonitorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
self.assertEqual(alert.execution_index, 60)
self.assertIsNone(alert.graph_execution_trace_index)
@parameterized.named_parameters(
("Shape",
debug_event_pb2.TensorDebugMode.SHAPE,
# [tensor_id, dtype, rank, element_count, ...shape_truncate_6]
[[-1, 1, 2, 6, 3, 2, 0, 0, 0, 0],
[-1, 10, 1, 7, 7, 0, 0, 0, 0, 0]]),
)
def testInfNanMonitorOnExecutionUnderModeWithNoInfNanInfo(
self,
tensor_debug_mode,
debug_tensor_values):
mock_reader = test.mock.MagicMock()
monitor = debug_events_monitors.InfNanMonitor(mock_reader)
execution_digest = debug_events_reader.ExecutionDigest(
1234, 1, "BarOp", output_tensor_device_ids=[0, 1])
execution = debug_events_reader.Execution(
execution_digest,
"worker01",
["a1", "b2", "e3"],
tensor_debug_mode,
graph_id=None,
input_tensor_ids=[12, 34],
output_tensor_ids=[56, 78],
debug_tensor_values=debug_tensor_values)
monitor.on_execution(60, execution)
self.assertEmpty(monitor.alerts())
@parameterized.named_parameters(
("FloatsScalarWithInfAndNan", np.inf, np.float32, 1, 0, 1, 0),
("Floats2DWithInfAndNan", [[0, np.nan, np.nan, -np.inf]