[tfdbg2] Let DebugIdentityV2 op carry circular_buffer_size attribute

Motivation:
- This CL addresses a bug in which the `circular_buffer_size` kwarg
  of the `tf.debugging.experimental.enable_dump_debug_info()` API works
  only on a local machine and doesn't behave as expected when a TF graph
  is executed on a remote TF server (e.g., a TPU worker).

Technical aspect of the change:
- Add an attribute to the DebugIdentityV2Op used by `enable_dump_debug_info()`:
  namely `circular_buffer_size`. This new attribute defaults to its previous
  effective default value (1000), and hence is backward compatible.
- This new attribute helps propagate the value of the `circular_buffer_size`
  setting from the host on which the instrumented graph is created to the
  host on which the graph is executed.

PiperOrigin-RevId: 314761103
Change-Id: Ifbc898a1272d9498d6f856020f0b1145190da2e7
This commit is contained in:
Shanqing Cai 2020-06-04 10:54:30 -07:00 committed by TensorFlower Gardener
parent 7e85bf98da
commit 171d688aaa
6 changed files with 146 additions and 34 deletions

View File

@ -428,13 +428,21 @@ class DebugIdentityV2Op : public OpKernel {
OP_REQUIRES_OK(context, context->GetAttr("output_slot", &output_slot_));
OP_REQUIRES_OK(context,
context->GetAttr("tensor_debug_mode", &tensor_debug_mode_));
if (context->HasAttr("circular_buffer_size")) {
OP_REQUIRES_OK(context, context->GetAttr("circular_buffer_size",
&circular_buffer_size_));
} else {
circular_buffer_size_ =
tfdbg::DebugEventsWriter::kDefaultCyclicBufferSize;
}
}
void Compute(OpKernelContext* context) override {
const Tensor& tensor = context->input(0);
for (const string& dump_root : dump_roots_) {
tfdbg::DebugEventsWriter* debug_events_writer =
tfdbg::DebugEventsWriter::GetDebugEventsWriter(dump_root);
tfdbg::DebugEventsWriter::GetDebugEventsWriter(dump_root,
circular_buffer_size_);
OP_REQUIRES_OK(context, debug_events_writer->WriteGraphExecutionTrace(
tfdbg_context_id_, device_name_, op_name_,
output_slot_, tensor_debug_mode_, tensor));
@ -449,6 +457,7 @@ class DebugIdentityV2Op : public OpKernel {
string op_name_;
int32 output_slot_;
int32 tensor_debug_mode_;
int64 circular_buffer_size_;
};
typedef Eigen::ThreadPoolDevice CPUDevice;

View File

@ -90,6 +90,7 @@ REGISTER_OP("DebugIdentityV2")
.Attr("output_slot: int = -1")
.Attr("tensor_debug_mode: int = -1")
.Attr("debug_urls: list(string) = []")
.Attr("circular_buffer_size: int = 1000")
.SetIsStateful()
.SetShapeFn(shape_inference::UnchangedShape);

View File

@ -23,6 +23,7 @@ import os
import numpy as np
from tensorflow.core.protobuf import debug_event_pb2
from tensorflow.python.compat import compat
from tensorflow.python.debug.lib import debug_events_reader
from tensorflow.python.debug.lib import debug_events_writer
from tensorflow.python.debug.lib import dumping_callback_test_lib
@ -40,6 +41,12 @@ from tensorflow.python.platform import googletest
class DebugIdentityV2OpTest(dumping_callback_test_lib.DumpingCallbackTestBase):
"""Tests for DebugIdentityV2Op: when DebugEventsWriter is initialized.
DebugEventsWriter being initialized prior to DebugIdentityV2 ops being invoked
for the first time is the typical case (e.g., tfdbg2 running on a local
machine with only local devices.)
"""
def setUp(self):
super(DebugIdentityV2OpTest, self).setUp()
@ -57,8 +64,6 @@ class DebugIdentityV2OpTest(dumping_callback_test_lib.DumpingCallbackTestBase):
@def_function.function
def write_debug_trace(x):
# DebugIdentityV2 is a stateful op. It ought to be included by auto
# control dependency.
square = math_ops.square(x)
gen_debug_ops.debug_identity_v2(
square,
@ -223,6 +228,64 @@ class DebugIdentityV2OpTest(dumping_callback_test_lib.DumpingCallbackTestBase):
with self.assertRaises(StopIteration):
next(graph_trace_iter)
class DebugIdentityV2OpUninitializedWriterTest(
dumping_callback_test_lib.DumpingCallbackTestBase):
"""Tests for DebugIdentityV2Op: when DebugEventsWriter is not initialized.
This case can occur when DebugIdentityV2Ops are running on a remote
TensorFlow server (e.g., a TPU worker).
"""
@test_util.run_in_graph_and_eager_modes
def testInvokingDebugIdentityV2OpBeforeCreatingDebugEventsWriterWorks(self):
if not compat.forward_compatible(2020, 6, 24):
self.skipTest("Functionality currently not supported.")
circular_buffer_size = 3
@def_function.function
def write_debug_trace(x):
# DebugIdentityV2 is a stateful op. It ought to be included by auto
# control dependency.
square = math_ops.square(x)
gen_debug_ops.debug_identity_v2(
square,
tfdbg_context_id="deadbeaf",
op_name="Square",
output_slot=0,
tensor_debug_mode=debug_event_pb2.TensorDebugMode.FULL_TENSOR,
debug_urls=["file://%s" % self.dump_root],
circular_buffer_size=circular_buffer_size)
return square
# The DebugIdentityV2 ops are invoked *before* a DebugEventsWriter at the
# same dump root is created.
for i in range(circular_buffer_size * 2):
self.assertAllClose(
write_debug_trace(np.array([i]).astype(np.float32)), [i**2.0])
writer = debug_events_writer.DebugEventsWriter(self.dump_root,
circular_buffer_size)
writer.FlushNonExecutionFiles()
writer.FlushExecutionFiles()
with debug_events_reader.DebugEventsReader(self.dump_root) as reader:
graph_trace_iter = reader.graph_execution_traces_iterator()
graph_execution_traces = []
while True:
try:
graph_execution_traces.append(
next(graph_trace_iter).debug_event.graph_execution_trace)
except StopIteration:
break
self.assertLen(graph_execution_traces, circular_buffer_size)
for i in range(circular_buffer_size):
self.assertAllClose(
tensor_util.MakeNdarray(graph_execution_traces[i].tensor_proto),
[(i + circular_buffer_size)**2.0])
class DebugNumericSummaryV2Test(test_util.TensorFlowTestCase):
@test_util.run_in_graph_and_eager_modes
def testDebugNumericSummaryV2OpReduceInfNanThreeSlots(self):

View File

@ -30,6 +30,7 @@ from six.moves import xrange # pylint: disable=redefined-builtin
from tensorflow.core.framework import tensor_pb2
from tensorflow.core.protobuf import debug_event_pb2
from tensorflow.core.protobuf import graph_debug_info_pb2
from tensorflow.python.compat import compat as tf_compat
from tensorflow.python.debug.lib import debug_events_writer
from tensorflow.python.debug.lib import op_callbacks_common
from tensorflow.python.debug.lib import source_utils
@ -366,17 +367,31 @@ class _DumpingCallback(object):
with self._symbolic_tensor_counter_lock:
debug_identity_name = ("DebugIdentityV2_%d" %
self._symbolic_tensor_counter)
debug_tensor = gen_debug_ops.debug_identity_v2(
# Use an empty (shape=[0]) float32 tensor for the NO_TENSOR mode
# as a low-overhead placeholder, since no actual tensor value is
# traced.
constant_op.constant([], dtype=dtypes.float32),
tfdbg_context_id=tfdbg_context_id,
op_name=op_name,
output_slot=output_slot,
tensor_debug_mode=self._tensor_debug_mode,
debug_urls=debug_urls,
name=debug_identity_name)
if tf_compat.forward_compatible(2020, 6, 24):
debug_tensor = gen_debug_ops.debug_identity_v2(
# Use an empty (shape=[0]) float32 tensor for the NO_TENSOR mode
# as a low-overhead placeholder, since no actual tensor value is
# traced.
constant_op.constant([], dtype=dtypes.float32),
tfdbg_context_id=tfdbg_context_id,
op_name=op_name,
output_slot=output_slot,
tensor_debug_mode=self._tensor_debug_mode,
debug_urls=debug_urls,
circular_buffer_size=self._circular_buffer_size,
name=debug_identity_name)
else:
debug_tensor = gen_debug_ops.debug_identity_v2(
# Use an empty (shape=[0]) float32 tensor for the NO_TENSOR mode
# as a low-overhead placeholder, since no actual tensor value is
# traced.
constant_op.constant([], dtype=dtypes.float32),
tfdbg_context_id=tfdbg_context_id,
op_name=op_name,
output_slot=output_slot,
tensor_debug_mode=self._tensor_debug_mode,
debug_urls=debug_urls,
name=debug_identity_name)
if is_v1_graph_mode:
instrumented_tensors.append(self._process_v1_graph_mode_tensor(
op_type, tensor, debug_tensor, tensor_debug_mode))
@ -400,17 +415,31 @@ class _DumpingCallback(object):
if is_v1_graph_mode:
instrumented_tensors.append(tensor)
continue
debug_tensor = gen_debug_ops.debug_identity_v2(
gen_debug_ops.debug_numeric_summary_v2(
tensor,
tensor_id=tensor_ids[output_slot],
tensor_debug_mode=self._tensor_debug_mode,
output_dtype=dtypes.float64),
tfdbg_context_id=tfdbg_context_id,
op_name=op_name,
output_slot=output_slot,
tensor_debug_mode=self._tensor_debug_mode,
debug_urls=debug_urls)
if tf_compat.forward_compatible(2020, 6, 24):
debug_tensor = gen_debug_ops.debug_identity_v2(
gen_debug_ops.debug_numeric_summary_v2(
tensor,
tensor_id=tensor_ids[output_slot],
tensor_debug_mode=self._tensor_debug_mode,
output_dtype=dtypes.float64),
tfdbg_context_id=tfdbg_context_id,
op_name=op_name,
output_slot=output_slot,
tensor_debug_mode=self._tensor_debug_mode,
debug_urls=debug_urls,
circular_buffer_size=self._circular_buffer_size)
else:
debug_tensor = gen_debug_ops.debug_identity_v2(
gen_debug_ops.debug_numeric_summary_v2(
tensor,
tensor_id=tensor_ids[output_slot],
tensor_debug_mode=self._tensor_debug_mode,
output_dtype=dtypes.float64),
tfdbg_context_id=tfdbg_context_id,
op_name=op_name,
output_slot=output_slot,
tensor_debug_mode=self._tensor_debug_mode,
debug_urls=debug_urls)
if is_v1_graph_mode:
instrumented_tensors.append(self._process_v1_graph_mode_tensor(
op_type, tensor, debug_tensor, tensor_debug_mode))
@ -424,13 +453,23 @@ class _DumpingCallback(object):
if is_v1_graph_mode:
instrumented_tensors.append(tensor)
continue
debug_tensor = gen_debug_ops.debug_identity_v2(
tensor,
tfdbg_context_id=tfdbg_context_id,
op_name=op_name,
output_slot=output_slot,
tensor_debug_mode=self._tensor_debug_mode,
debug_urls=debug_urls)
if tf_compat.forward_compatible(2020, 6, 24):
debug_tensor = gen_debug_ops.debug_identity_v2(
tensor,
tfdbg_context_id=tfdbg_context_id,
op_name=op_name,
output_slot=output_slot,
tensor_debug_mode=self._tensor_debug_mode,
debug_urls=debug_urls,
circular_buffer_size=self._circular_buffer_size)
else:
debug_tensor = gen_debug_ops.debug_identity_v2(
tensor,
tfdbg_context_id=tfdbg_context_id,
op_name=op_name,
output_slot=output_slot,
tensor_debug_mode=self._tensor_debug_mode,
debug_urls=debug_urls)
if is_v1_graph_mode:
instrumented_tensors.append(self._process_v1_graph_mode_tensor(
op_type, tensor, debug_tensor, tensor_debug_mode))

View File

@ -982,7 +982,7 @@ tf_module {
}
member_method {
name: "DebugIdentityV2"
argspec: "args=[\'input\', \'tfdbg_context_id\', \'op_name\', \'output_slot\', \'tensor_debug_mode\', \'debug_urls\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'-1\', \'-1\', \'[]\', \'None\'], "
argspec: "args=[\'input\', \'tfdbg_context_id\', \'op_name\', \'output_slot\', \'tensor_debug_mode\', \'debug_urls\', \'circular_buffer_size\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'-1\', \'-1\', \'[]\', \'1000\', \'None\'], "
}
member_method {
name: "DebugNanCount"

View File

@ -982,7 +982,7 @@ tf_module {
}
member_method {
name: "DebugIdentityV2"
argspec: "args=[\'input\', \'tfdbg_context_id\', \'op_name\', \'output_slot\', \'tensor_debug_mode\', \'debug_urls\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'-1\', \'-1\', \'[]\', \'None\'], "
argspec: "args=[\'input\', \'tfdbg_context_id\', \'op_name\', \'output_slot\', \'tensor_debug_mode\', \'debug_urls\', \'circular_buffer_size\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'-1\', \'-1\', \'[]\', \'1000\', \'None\'], "
}
member_method {
name: "DebugNanCount"