[tfdbg2] Let DebugIdentityV2 op carry circular_buffer_size attribute
Motivation: - This CL addresses a bug in which the `circular_buffer_size` kwarg of the `tf.debugging.experimental.enable_dump_debug_info()` API works only on a local machine and doesn't behave as expected when a TF graph is executed on a remote TF server (e.g., a TPU worker). Technical aspect of the change: - Add an attribute to the DebugIdentityV2Op used by `enable_dump_debug_info()`: namely `circular_buffer_size`. This new attribute defaults to its previous effective default value (1000), and hence is backward compatible. - This new attribute helps propagate the value of `circular_buffer_size` setting from the host on which the instrumented graph is created to the host on which the graph is executed. PiperOrigin-RevId: 314761103 Change-Id: Ifbc898a1272d9498d6f856020f0b1145190da2e7
This commit is contained in:
parent
7e85bf98da
commit
171d688aaa
@ -428,13 +428,21 @@ class DebugIdentityV2Op : public OpKernel {
|
||||
OP_REQUIRES_OK(context, context->GetAttr("output_slot", &output_slot_));
|
||||
OP_REQUIRES_OK(context,
|
||||
context->GetAttr("tensor_debug_mode", &tensor_debug_mode_));
|
||||
if (context->HasAttr("circular_buffer_size")) {
|
||||
OP_REQUIRES_OK(context, context->GetAttr("circular_buffer_size",
|
||||
&circular_buffer_size_));
|
||||
} else {
|
||||
circular_buffer_size_ =
|
||||
tfdbg::DebugEventsWriter::kDefaultCyclicBufferSize;
|
||||
}
|
||||
}
|
||||
|
||||
void Compute(OpKernelContext* context) override {
|
||||
const Tensor& tensor = context->input(0);
|
||||
for (const string& dump_root : dump_roots_) {
|
||||
tfdbg::DebugEventsWriter* debug_events_writer =
|
||||
tfdbg::DebugEventsWriter::GetDebugEventsWriter(dump_root);
|
||||
tfdbg::DebugEventsWriter::GetDebugEventsWriter(dump_root,
|
||||
circular_buffer_size_);
|
||||
OP_REQUIRES_OK(context, debug_events_writer->WriteGraphExecutionTrace(
|
||||
tfdbg_context_id_, device_name_, op_name_,
|
||||
output_slot_, tensor_debug_mode_, tensor));
|
||||
@ -449,6 +457,7 @@ class DebugIdentityV2Op : public OpKernel {
|
||||
string op_name_;
|
||||
int32 output_slot_;
|
||||
int32 tensor_debug_mode_;
|
||||
int64 circular_buffer_size_;
|
||||
};
|
||||
|
||||
typedef Eigen::ThreadPoolDevice CPUDevice;
|
||||
|
||||
@ -90,6 +90,7 @@ REGISTER_OP("DebugIdentityV2")
|
||||
.Attr("output_slot: int = -1")
|
||||
.Attr("tensor_debug_mode: int = -1")
|
||||
.Attr("debug_urls: list(string) = []")
|
||||
.Attr("circular_buffer_size: int = 1000")
|
||||
.SetIsStateful()
|
||||
.SetShapeFn(shape_inference::UnchangedShape);
|
||||
|
||||
|
||||
@ -23,6 +23,7 @@ import os
|
||||
import numpy as np
|
||||
|
||||
from tensorflow.core.protobuf import debug_event_pb2
|
||||
from tensorflow.python.compat import compat
|
||||
from tensorflow.python.debug.lib import debug_events_reader
|
||||
from tensorflow.python.debug.lib import debug_events_writer
|
||||
from tensorflow.python.debug.lib import dumping_callback_test_lib
|
||||
@ -40,6 +41,12 @@ from tensorflow.python.platform import googletest
|
||||
|
||||
|
||||
class DebugIdentityV2OpTest(dumping_callback_test_lib.DumpingCallbackTestBase):
|
||||
"""Tests for DebugIdentityV2Op: when DebugEventsWriter is initialized.
|
||||
|
||||
DebugEventsWriter being initialized prior to DebugIdentityV2 ops being invoked
|
||||
for the first time is the typical case (e.g., tfdbg2 running on a local
|
||||
machine with only local devices.)
|
||||
"""
|
||||
|
||||
def setUp(self):
|
||||
super(DebugIdentityV2OpTest, self).setUp()
|
||||
@ -57,8 +64,6 @@ class DebugIdentityV2OpTest(dumping_callback_test_lib.DumpingCallbackTestBase):
|
||||
|
||||
@def_function.function
|
||||
def write_debug_trace(x):
|
||||
# DebugIdentityV2 is a stateful op. It ought to be included by auto
|
||||
# control dependency.
|
||||
square = math_ops.square(x)
|
||||
gen_debug_ops.debug_identity_v2(
|
||||
square,
|
||||
@ -223,6 +228,64 @@ class DebugIdentityV2OpTest(dumping_callback_test_lib.DumpingCallbackTestBase):
|
||||
with self.assertRaises(StopIteration):
|
||||
next(graph_trace_iter)
|
||||
|
||||
|
||||
class DebugIdentityV2OpUninitializedWriterTest(
|
||||
dumping_callback_test_lib.DumpingCallbackTestBase):
|
||||
"""Tests for DebugIdentityV2Op: when DebugEventsWriter is not initialized.
|
||||
|
||||
This case can occur when DebugIdentityV2Ops are running on a remote
|
||||
TensorFlow server (e.g., a TPU worker).
|
||||
"""
|
||||
|
||||
@test_util.run_in_graph_and_eager_modes
|
||||
def testInvokingDebugIdentityV2OpBeforeCreatingDebugEventsWriterWorks(self):
|
||||
if not compat.forward_compatible(2020, 6, 24):
|
||||
self.skipTest("Functionality currently not supported.")
|
||||
circular_buffer_size = 3
|
||||
|
||||
@def_function.function
|
||||
def write_debug_trace(x):
|
||||
# DebugIdentityV2 is a stateful op. It ought to be included by auto
|
||||
# control dependency.
|
||||
square = math_ops.square(x)
|
||||
gen_debug_ops.debug_identity_v2(
|
||||
square,
|
||||
tfdbg_context_id="deadbeaf",
|
||||
op_name="Square",
|
||||
output_slot=0,
|
||||
tensor_debug_mode=debug_event_pb2.TensorDebugMode.FULL_TENSOR,
|
||||
debug_urls=["file://%s" % self.dump_root],
|
||||
circular_buffer_size=circular_buffer_size)
|
||||
return square
|
||||
|
||||
# The DebugIdentityV2 ops are invoked *before* a DebugEventsWriter at the
|
||||
# same dump root is created.
|
||||
for i in range(circular_buffer_size * 2):
|
||||
self.assertAllClose(
|
||||
write_debug_trace(np.array([i]).astype(np.float32)), [i**2.0])
|
||||
writer = debug_events_writer.DebugEventsWriter(self.dump_root,
|
||||
circular_buffer_size)
|
||||
writer.FlushNonExecutionFiles()
|
||||
writer.FlushExecutionFiles()
|
||||
|
||||
with debug_events_reader.DebugEventsReader(self.dump_root) as reader:
|
||||
graph_trace_iter = reader.graph_execution_traces_iterator()
|
||||
graph_execution_traces = []
|
||||
while True:
|
||||
try:
|
||||
graph_execution_traces.append(
|
||||
next(graph_trace_iter).debug_event.graph_execution_trace)
|
||||
except StopIteration:
|
||||
break
|
||||
self.assertLen(graph_execution_traces, circular_buffer_size)
|
||||
for i in range(circular_buffer_size):
|
||||
self.assertAllClose(
|
||||
tensor_util.MakeNdarray(graph_execution_traces[i].tensor_proto),
|
||||
[(i + circular_buffer_size)**2.0])
|
||||
|
||||
|
||||
class DebugNumericSummaryV2Test(test_util.TensorFlowTestCase):
|
||||
|
||||
@test_util.run_in_graph_and_eager_modes
|
||||
def testDebugNumericSummaryV2OpReduceInfNanThreeSlots(self):
|
||||
|
||||
|
||||
@ -30,6 +30,7 @@ from six.moves import xrange # pylint: disable=redefined-builtin
|
||||
from tensorflow.core.framework import tensor_pb2
|
||||
from tensorflow.core.protobuf import debug_event_pb2
|
||||
from tensorflow.core.protobuf import graph_debug_info_pb2
|
||||
from tensorflow.python.compat import compat as tf_compat
|
||||
from tensorflow.python.debug.lib import debug_events_writer
|
||||
from tensorflow.python.debug.lib import op_callbacks_common
|
||||
from tensorflow.python.debug.lib import source_utils
|
||||
@ -366,17 +367,31 @@ class _DumpingCallback(object):
|
||||
with self._symbolic_tensor_counter_lock:
|
||||
debug_identity_name = ("DebugIdentityV2_%d" %
|
||||
self._symbolic_tensor_counter)
|
||||
debug_tensor = gen_debug_ops.debug_identity_v2(
|
||||
# Use an empty (shape=[0]) float32 tensor for the NO_TENSOR mode
|
||||
# as a low-overhead placeholder, since no actual tensor value is
|
||||
# traced.
|
||||
constant_op.constant([], dtype=dtypes.float32),
|
||||
tfdbg_context_id=tfdbg_context_id,
|
||||
op_name=op_name,
|
||||
output_slot=output_slot,
|
||||
tensor_debug_mode=self._tensor_debug_mode,
|
||||
debug_urls=debug_urls,
|
||||
name=debug_identity_name)
|
||||
if tf_compat.forward_compatible(2020, 6, 24):
|
||||
debug_tensor = gen_debug_ops.debug_identity_v2(
|
||||
# Use an empty (shape=[0]) float32 tensor for the NO_TENSOR mode
|
||||
# as a low-overhead placeholder, since no actual tensor value is
|
||||
# traced.
|
||||
constant_op.constant([], dtype=dtypes.float32),
|
||||
tfdbg_context_id=tfdbg_context_id,
|
||||
op_name=op_name,
|
||||
output_slot=output_slot,
|
||||
tensor_debug_mode=self._tensor_debug_mode,
|
||||
debug_urls=debug_urls,
|
||||
circular_buffer_size=self._circular_buffer_size,
|
||||
name=debug_identity_name)
|
||||
else:
|
||||
debug_tensor = gen_debug_ops.debug_identity_v2(
|
||||
# Use an empty (shape=[0]) float32 tensor for the NO_TENSOR mode
|
||||
# as a low-overhead placeholder, since no actual tensor value is
|
||||
# traced.
|
||||
constant_op.constant([], dtype=dtypes.float32),
|
||||
tfdbg_context_id=tfdbg_context_id,
|
||||
op_name=op_name,
|
||||
output_slot=output_slot,
|
||||
tensor_debug_mode=self._tensor_debug_mode,
|
||||
debug_urls=debug_urls,
|
||||
name=debug_identity_name)
|
||||
if is_v1_graph_mode:
|
||||
instrumented_tensors.append(self._process_v1_graph_mode_tensor(
|
||||
op_type, tensor, debug_tensor, tensor_debug_mode))
|
||||
@ -400,17 +415,31 @@ class _DumpingCallback(object):
|
||||
if is_v1_graph_mode:
|
||||
instrumented_tensors.append(tensor)
|
||||
continue
|
||||
debug_tensor = gen_debug_ops.debug_identity_v2(
|
||||
gen_debug_ops.debug_numeric_summary_v2(
|
||||
tensor,
|
||||
tensor_id=tensor_ids[output_slot],
|
||||
tensor_debug_mode=self._tensor_debug_mode,
|
||||
output_dtype=dtypes.float64),
|
||||
tfdbg_context_id=tfdbg_context_id,
|
||||
op_name=op_name,
|
||||
output_slot=output_slot,
|
||||
tensor_debug_mode=self._tensor_debug_mode,
|
||||
debug_urls=debug_urls)
|
||||
if tf_compat.forward_compatible(2020, 6, 24):
|
||||
debug_tensor = gen_debug_ops.debug_identity_v2(
|
||||
gen_debug_ops.debug_numeric_summary_v2(
|
||||
tensor,
|
||||
tensor_id=tensor_ids[output_slot],
|
||||
tensor_debug_mode=self._tensor_debug_mode,
|
||||
output_dtype=dtypes.float64),
|
||||
tfdbg_context_id=tfdbg_context_id,
|
||||
op_name=op_name,
|
||||
output_slot=output_slot,
|
||||
tensor_debug_mode=self._tensor_debug_mode,
|
||||
debug_urls=debug_urls,
|
||||
circular_buffer_size=self._circular_buffer_size)
|
||||
else:
|
||||
debug_tensor = gen_debug_ops.debug_identity_v2(
|
||||
gen_debug_ops.debug_numeric_summary_v2(
|
||||
tensor,
|
||||
tensor_id=tensor_ids[output_slot],
|
||||
tensor_debug_mode=self._tensor_debug_mode,
|
||||
output_dtype=dtypes.float64),
|
||||
tfdbg_context_id=tfdbg_context_id,
|
||||
op_name=op_name,
|
||||
output_slot=output_slot,
|
||||
tensor_debug_mode=self._tensor_debug_mode,
|
||||
debug_urls=debug_urls)
|
||||
if is_v1_graph_mode:
|
||||
instrumented_tensors.append(self._process_v1_graph_mode_tensor(
|
||||
op_type, tensor, debug_tensor, tensor_debug_mode))
|
||||
@ -424,13 +453,23 @@ class _DumpingCallback(object):
|
||||
if is_v1_graph_mode:
|
||||
instrumented_tensors.append(tensor)
|
||||
continue
|
||||
debug_tensor = gen_debug_ops.debug_identity_v2(
|
||||
tensor,
|
||||
tfdbg_context_id=tfdbg_context_id,
|
||||
op_name=op_name,
|
||||
output_slot=output_slot,
|
||||
tensor_debug_mode=self._tensor_debug_mode,
|
||||
debug_urls=debug_urls)
|
||||
if tf_compat.forward_compatible(2020, 6, 24):
|
||||
debug_tensor = gen_debug_ops.debug_identity_v2(
|
||||
tensor,
|
||||
tfdbg_context_id=tfdbg_context_id,
|
||||
op_name=op_name,
|
||||
output_slot=output_slot,
|
||||
tensor_debug_mode=self._tensor_debug_mode,
|
||||
debug_urls=debug_urls,
|
||||
circular_buffer_size=self._circular_buffer_size)
|
||||
else:
|
||||
debug_tensor = gen_debug_ops.debug_identity_v2(
|
||||
tensor,
|
||||
tfdbg_context_id=tfdbg_context_id,
|
||||
op_name=op_name,
|
||||
output_slot=output_slot,
|
||||
tensor_debug_mode=self._tensor_debug_mode,
|
||||
debug_urls=debug_urls)
|
||||
if is_v1_graph_mode:
|
||||
instrumented_tensors.append(self._process_v1_graph_mode_tensor(
|
||||
op_type, tensor, debug_tensor, tensor_debug_mode))
|
||||
|
||||
@ -982,7 +982,7 @@ tf_module {
|
||||
}
|
||||
member_method {
|
||||
name: "DebugIdentityV2"
|
||||
argspec: "args=[\'input\', \'tfdbg_context_id\', \'op_name\', \'output_slot\', \'tensor_debug_mode\', \'debug_urls\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'-1\', \'-1\', \'[]\', \'None\'], "
|
||||
argspec: "args=[\'input\', \'tfdbg_context_id\', \'op_name\', \'output_slot\', \'tensor_debug_mode\', \'debug_urls\', \'circular_buffer_size\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'-1\', \'-1\', \'[]\', \'1000\', \'None\'], "
|
||||
}
|
||||
member_method {
|
||||
name: "DebugNanCount"
|
||||
|
||||
@ -982,7 +982,7 @@ tf_module {
|
||||
}
|
||||
member_method {
|
||||
name: "DebugIdentityV2"
|
||||
argspec: "args=[\'input\', \'tfdbg_context_id\', \'op_name\', \'output_slot\', \'tensor_debug_mode\', \'debug_urls\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'-1\', \'-1\', \'[]\', \'None\'], "
|
||||
argspec: "args=[\'input\', \'tfdbg_context_id\', \'op_name\', \'output_slot\', \'tensor_debug_mode\', \'debug_urls\', \'circular_buffer_size\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'-1\', \'-1\', \'[]\', \'1000\', \'None\'], "
|
||||
}
|
||||
member_method {
|
||||
name: "DebugNanCount"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user