[tfdbg2] Let DebugIdentityV2 op carry circular_buffer_size attribute
Motivation: - This CL addresses a bug in which the `circular_buffer_size` kwarg of the `tf.debugging.experimental.enable_dump_debug_info()` API works only on a local machine and doesn't behave as expected when a TF graph is executed on a remote TF server (e.g., a TPU worker). Technical aspect of the change: - Add an attribute to the DebugIdentityV2Op used by `enable_dump_debug_info()`: namely `circular_buffer_size`. This new attribute defaults to its previous effective default value (1000), and hence is backward compatible. - This new attribute helps propagate the value of the `circular_buffer_size` setting from the host on which the instrumented graph is created to the host on which the graph is executed. PiperOrigin-RevId: 314761103 Change-Id: Ifbc898a1272d9498d6f856020f0b1145190da2e7
This commit is contained in:
parent
7e85bf98da
commit
171d688aaa
@ -428,13 +428,21 @@ class DebugIdentityV2Op : public OpKernel {
|
|||||||
OP_REQUIRES_OK(context, context->GetAttr("output_slot", &output_slot_));
|
OP_REQUIRES_OK(context, context->GetAttr("output_slot", &output_slot_));
|
||||||
OP_REQUIRES_OK(context,
|
OP_REQUIRES_OK(context,
|
||||||
context->GetAttr("tensor_debug_mode", &tensor_debug_mode_));
|
context->GetAttr("tensor_debug_mode", &tensor_debug_mode_));
|
||||||
|
if (context->HasAttr("circular_buffer_size")) {
|
||||||
|
OP_REQUIRES_OK(context, context->GetAttr("circular_buffer_size",
|
||||||
|
&circular_buffer_size_));
|
||||||
|
} else {
|
||||||
|
circular_buffer_size_ =
|
||||||
|
tfdbg::DebugEventsWriter::kDefaultCyclicBufferSize;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void Compute(OpKernelContext* context) override {
|
void Compute(OpKernelContext* context) override {
|
||||||
const Tensor& tensor = context->input(0);
|
const Tensor& tensor = context->input(0);
|
||||||
for (const string& dump_root : dump_roots_) {
|
for (const string& dump_root : dump_roots_) {
|
||||||
tfdbg::DebugEventsWriter* debug_events_writer =
|
tfdbg::DebugEventsWriter* debug_events_writer =
|
||||||
tfdbg::DebugEventsWriter::GetDebugEventsWriter(dump_root);
|
tfdbg::DebugEventsWriter::GetDebugEventsWriter(dump_root,
|
||||||
|
circular_buffer_size_);
|
||||||
OP_REQUIRES_OK(context, debug_events_writer->WriteGraphExecutionTrace(
|
OP_REQUIRES_OK(context, debug_events_writer->WriteGraphExecutionTrace(
|
||||||
tfdbg_context_id_, device_name_, op_name_,
|
tfdbg_context_id_, device_name_, op_name_,
|
||||||
output_slot_, tensor_debug_mode_, tensor));
|
output_slot_, tensor_debug_mode_, tensor));
|
||||||
@ -449,6 +457,7 @@ class DebugIdentityV2Op : public OpKernel {
|
|||||||
string op_name_;
|
string op_name_;
|
||||||
int32 output_slot_;
|
int32 output_slot_;
|
||||||
int32 tensor_debug_mode_;
|
int32 tensor_debug_mode_;
|
||||||
|
int64 circular_buffer_size_;
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef Eigen::ThreadPoolDevice CPUDevice;
|
typedef Eigen::ThreadPoolDevice CPUDevice;
|
||||||
|
|||||||
@ -90,6 +90,7 @@ REGISTER_OP("DebugIdentityV2")
|
|||||||
.Attr("output_slot: int = -1")
|
.Attr("output_slot: int = -1")
|
||||||
.Attr("tensor_debug_mode: int = -1")
|
.Attr("tensor_debug_mode: int = -1")
|
||||||
.Attr("debug_urls: list(string) = []")
|
.Attr("debug_urls: list(string) = []")
|
||||||
|
.Attr("circular_buffer_size: int = 1000")
|
||||||
.SetIsStateful()
|
.SetIsStateful()
|
||||||
.SetShapeFn(shape_inference::UnchangedShape);
|
.SetShapeFn(shape_inference::UnchangedShape);
|
||||||
|
|
||||||
|
|||||||
@ -23,6 +23,7 @@ import os
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from tensorflow.core.protobuf import debug_event_pb2
|
from tensorflow.core.protobuf import debug_event_pb2
|
||||||
|
from tensorflow.python.compat import compat
|
||||||
from tensorflow.python.debug.lib import debug_events_reader
|
from tensorflow.python.debug.lib import debug_events_reader
|
||||||
from tensorflow.python.debug.lib import debug_events_writer
|
from tensorflow.python.debug.lib import debug_events_writer
|
||||||
from tensorflow.python.debug.lib import dumping_callback_test_lib
|
from tensorflow.python.debug.lib import dumping_callback_test_lib
|
||||||
@ -40,6 +41,12 @@ from tensorflow.python.platform import googletest
|
|||||||
|
|
||||||
|
|
||||||
class DebugIdentityV2OpTest(dumping_callback_test_lib.DumpingCallbackTestBase):
|
class DebugIdentityV2OpTest(dumping_callback_test_lib.DumpingCallbackTestBase):
|
||||||
|
"""Tests for DebugIdentityV2Op: when DebugEventsWriter is initialized.
|
||||||
|
|
||||||
|
DebugEventsWriter being initialized prior to DebugIdentityV2 ops being invoked
|
||||||
|
for the first time is the typical case (e.g., tfdbg2 running on a local
|
||||||
|
machine with only local devices.)
|
||||||
|
"""
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
super(DebugIdentityV2OpTest, self).setUp()
|
super(DebugIdentityV2OpTest, self).setUp()
|
||||||
@ -57,8 +64,6 @@ class DebugIdentityV2OpTest(dumping_callback_test_lib.DumpingCallbackTestBase):
|
|||||||
|
|
||||||
@def_function.function
|
@def_function.function
|
||||||
def write_debug_trace(x):
|
def write_debug_trace(x):
|
||||||
# DebugIdentityV2 is a stateful op. It ought to be included by auto
|
|
||||||
# control dependency.
|
|
||||||
square = math_ops.square(x)
|
square = math_ops.square(x)
|
||||||
gen_debug_ops.debug_identity_v2(
|
gen_debug_ops.debug_identity_v2(
|
||||||
square,
|
square,
|
||||||
@ -223,6 +228,64 @@ class DebugIdentityV2OpTest(dumping_callback_test_lib.DumpingCallbackTestBase):
|
|||||||
with self.assertRaises(StopIteration):
|
with self.assertRaises(StopIteration):
|
||||||
next(graph_trace_iter)
|
next(graph_trace_iter)
|
||||||
|
|
||||||
|
|
||||||
|
class DebugIdentityV2OpUninitializedWriterTest(
|
||||||
|
dumping_callback_test_lib.DumpingCallbackTestBase):
|
||||||
|
"""Tests for DebugIdentityV2Op: when DebugEventsWriter is not initialized.
|
||||||
|
|
||||||
|
This case can occur when DebugIdentityV2Ops are running on a remote
|
||||||
|
TensorFlow server (e.g., a TPU worker).
|
||||||
|
"""
|
||||||
|
|
||||||
|
@test_util.run_in_graph_and_eager_modes
|
||||||
|
def testInvokingDebugIdentityV2OpBeforeCreatingDebugEventsWriterWorks(self):
|
||||||
|
if not compat.forward_compatible(2020, 6, 24):
|
||||||
|
self.skipTest("Functionality currently not supported.")
|
||||||
|
circular_buffer_size = 3
|
||||||
|
|
||||||
|
@def_function.function
|
||||||
|
def write_debug_trace(x):
|
||||||
|
# DebugIdentityV2 is a stateful op. It ought to be included by auto
|
||||||
|
# control dependency.
|
||||||
|
square = math_ops.square(x)
|
||||||
|
gen_debug_ops.debug_identity_v2(
|
||||||
|
square,
|
||||||
|
tfdbg_context_id="deadbeaf",
|
||||||
|
op_name="Square",
|
||||||
|
output_slot=0,
|
||||||
|
tensor_debug_mode=debug_event_pb2.TensorDebugMode.FULL_TENSOR,
|
||||||
|
debug_urls=["file://%s" % self.dump_root],
|
||||||
|
circular_buffer_size=circular_buffer_size)
|
||||||
|
return square
|
||||||
|
|
||||||
|
# The DebugIdentityV2 ops are invoked *before* a DebugEventsWriter at the
|
||||||
|
# same dump root is created.
|
||||||
|
for i in range(circular_buffer_size * 2):
|
||||||
|
self.assertAllClose(
|
||||||
|
write_debug_trace(np.array([i]).astype(np.float32)), [i**2.0])
|
||||||
|
writer = debug_events_writer.DebugEventsWriter(self.dump_root,
|
||||||
|
circular_buffer_size)
|
||||||
|
writer.FlushNonExecutionFiles()
|
||||||
|
writer.FlushExecutionFiles()
|
||||||
|
|
||||||
|
with debug_events_reader.DebugEventsReader(self.dump_root) as reader:
|
||||||
|
graph_trace_iter = reader.graph_execution_traces_iterator()
|
||||||
|
graph_execution_traces = []
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
graph_execution_traces.append(
|
||||||
|
next(graph_trace_iter).debug_event.graph_execution_trace)
|
||||||
|
except StopIteration:
|
||||||
|
break
|
||||||
|
self.assertLen(graph_execution_traces, circular_buffer_size)
|
||||||
|
for i in range(circular_buffer_size):
|
||||||
|
self.assertAllClose(
|
||||||
|
tensor_util.MakeNdarray(graph_execution_traces[i].tensor_proto),
|
||||||
|
[(i + circular_buffer_size)**2.0])
|
||||||
|
|
||||||
|
|
||||||
|
class DebugNumericSummaryV2Test(test_util.TensorFlowTestCase):
|
||||||
|
|
||||||
@test_util.run_in_graph_and_eager_modes
|
@test_util.run_in_graph_and_eager_modes
|
||||||
def testDebugNumericSummaryV2OpReduceInfNanThreeSlots(self):
|
def testDebugNumericSummaryV2OpReduceInfNanThreeSlots(self):
|
||||||
|
|
||||||
|
|||||||
@ -30,6 +30,7 @@ from six.moves import xrange # pylint: disable=redefined-builtin
|
|||||||
from tensorflow.core.framework import tensor_pb2
|
from tensorflow.core.framework import tensor_pb2
|
||||||
from tensorflow.core.protobuf import debug_event_pb2
|
from tensorflow.core.protobuf import debug_event_pb2
|
||||||
from tensorflow.core.protobuf import graph_debug_info_pb2
|
from tensorflow.core.protobuf import graph_debug_info_pb2
|
||||||
|
from tensorflow.python.compat import compat as tf_compat
|
||||||
from tensorflow.python.debug.lib import debug_events_writer
|
from tensorflow.python.debug.lib import debug_events_writer
|
||||||
from tensorflow.python.debug.lib import op_callbacks_common
|
from tensorflow.python.debug.lib import op_callbacks_common
|
||||||
from tensorflow.python.debug.lib import source_utils
|
from tensorflow.python.debug.lib import source_utils
|
||||||
@ -366,17 +367,31 @@ class _DumpingCallback(object):
|
|||||||
with self._symbolic_tensor_counter_lock:
|
with self._symbolic_tensor_counter_lock:
|
||||||
debug_identity_name = ("DebugIdentityV2_%d" %
|
debug_identity_name = ("DebugIdentityV2_%d" %
|
||||||
self._symbolic_tensor_counter)
|
self._symbolic_tensor_counter)
|
||||||
debug_tensor = gen_debug_ops.debug_identity_v2(
|
if tf_compat.forward_compatible(2020, 6, 24):
|
||||||
# Use an empty (shape=[0]) float32 tensor for the NO_TENSOR mode
|
debug_tensor = gen_debug_ops.debug_identity_v2(
|
||||||
# as a low-overhead placeholder, since no actual tensor value is
|
# Use an empty (shape=[0]) float32 tensor for the NO_TENSOR mode
|
||||||
# traced.
|
# as a low-overhead placeholder, since no actual tensor value is
|
||||||
constant_op.constant([], dtype=dtypes.float32),
|
# traced.
|
||||||
tfdbg_context_id=tfdbg_context_id,
|
constant_op.constant([], dtype=dtypes.float32),
|
||||||
op_name=op_name,
|
tfdbg_context_id=tfdbg_context_id,
|
||||||
output_slot=output_slot,
|
op_name=op_name,
|
||||||
tensor_debug_mode=self._tensor_debug_mode,
|
output_slot=output_slot,
|
||||||
debug_urls=debug_urls,
|
tensor_debug_mode=self._tensor_debug_mode,
|
||||||
name=debug_identity_name)
|
debug_urls=debug_urls,
|
||||||
|
circular_buffer_size=self._circular_buffer_size,
|
||||||
|
name=debug_identity_name)
|
||||||
|
else:
|
||||||
|
debug_tensor = gen_debug_ops.debug_identity_v2(
|
||||||
|
# Use an empty (shape=[0]) float32 tensor for the NO_TENSOR mode
|
||||||
|
# as a low-overhead placeholder, since no actual tensor value is
|
||||||
|
# traced.
|
||||||
|
constant_op.constant([], dtype=dtypes.float32),
|
||||||
|
tfdbg_context_id=tfdbg_context_id,
|
||||||
|
op_name=op_name,
|
||||||
|
output_slot=output_slot,
|
||||||
|
tensor_debug_mode=self._tensor_debug_mode,
|
||||||
|
debug_urls=debug_urls,
|
||||||
|
name=debug_identity_name)
|
||||||
if is_v1_graph_mode:
|
if is_v1_graph_mode:
|
||||||
instrumented_tensors.append(self._process_v1_graph_mode_tensor(
|
instrumented_tensors.append(self._process_v1_graph_mode_tensor(
|
||||||
op_type, tensor, debug_tensor, tensor_debug_mode))
|
op_type, tensor, debug_tensor, tensor_debug_mode))
|
||||||
@ -400,17 +415,31 @@ class _DumpingCallback(object):
|
|||||||
if is_v1_graph_mode:
|
if is_v1_graph_mode:
|
||||||
instrumented_tensors.append(tensor)
|
instrumented_tensors.append(tensor)
|
||||||
continue
|
continue
|
||||||
debug_tensor = gen_debug_ops.debug_identity_v2(
|
if tf_compat.forward_compatible(2020, 6, 24):
|
||||||
gen_debug_ops.debug_numeric_summary_v2(
|
debug_tensor = gen_debug_ops.debug_identity_v2(
|
||||||
tensor,
|
gen_debug_ops.debug_numeric_summary_v2(
|
||||||
tensor_id=tensor_ids[output_slot],
|
tensor,
|
||||||
tensor_debug_mode=self._tensor_debug_mode,
|
tensor_id=tensor_ids[output_slot],
|
||||||
output_dtype=dtypes.float64),
|
tensor_debug_mode=self._tensor_debug_mode,
|
||||||
tfdbg_context_id=tfdbg_context_id,
|
output_dtype=dtypes.float64),
|
||||||
op_name=op_name,
|
tfdbg_context_id=tfdbg_context_id,
|
||||||
output_slot=output_slot,
|
op_name=op_name,
|
||||||
tensor_debug_mode=self._tensor_debug_mode,
|
output_slot=output_slot,
|
||||||
debug_urls=debug_urls)
|
tensor_debug_mode=self._tensor_debug_mode,
|
||||||
|
debug_urls=debug_urls,
|
||||||
|
circular_buffer_size=self._circular_buffer_size)
|
||||||
|
else:
|
||||||
|
debug_tensor = gen_debug_ops.debug_identity_v2(
|
||||||
|
gen_debug_ops.debug_numeric_summary_v2(
|
||||||
|
tensor,
|
||||||
|
tensor_id=tensor_ids[output_slot],
|
||||||
|
tensor_debug_mode=self._tensor_debug_mode,
|
||||||
|
output_dtype=dtypes.float64),
|
||||||
|
tfdbg_context_id=tfdbg_context_id,
|
||||||
|
op_name=op_name,
|
||||||
|
output_slot=output_slot,
|
||||||
|
tensor_debug_mode=self._tensor_debug_mode,
|
||||||
|
debug_urls=debug_urls)
|
||||||
if is_v1_graph_mode:
|
if is_v1_graph_mode:
|
||||||
instrumented_tensors.append(self._process_v1_graph_mode_tensor(
|
instrumented_tensors.append(self._process_v1_graph_mode_tensor(
|
||||||
op_type, tensor, debug_tensor, tensor_debug_mode))
|
op_type, tensor, debug_tensor, tensor_debug_mode))
|
||||||
@ -424,13 +453,23 @@ class _DumpingCallback(object):
|
|||||||
if is_v1_graph_mode:
|
if is_v1_graph_mode:
|
||||||
instrumented_tensors.append(tensor)
|
instrumented_tensors.append(tensor)
|
||||||
continue
|
continue
|
||||||
debug_tensor = gen_debug_ops.debug_identity_v2(
|
if tf_compat.forward_compatible(2020, 6, 24):
|
||||||
tensor,
|
debug_tensor = gen_debug_ops.debug_identity_v2(
|
||||||
tfdbg_context_id=tfdbg_context_id,
|
tensor,
|
||||||
op_name=op_name,
|
tfdbg_context_id=tfdbg_context_id,
|
||||||
output_slot=output_slot,
|
op_name=op_name,
|
||||||
tensor_debug_mode=self._tensor_debug_mode,
|
output_slot=output_slot,
|
||||||
debug_urls=debug_urls)
|
tensor_debug_mode=self._tensor_debug_mode,
|
||||||
|
debug_urls=debug_urls,
|
||||||
|
circular_buffer_size=self._circular_buffer_size)
|
||||||
|
else:
|
||||||
|
debug_tensor = gen_debug_ops.debug_identity_v2(
|
||||||
|
tensor,
|
||||||
|
tfdbg_context_id=tfdbg_context_id,
|
||||||
|
op_name=op_name,
|
||||||
|
output_slot=output_slot,
|
||||||
|
tensor_debug_mode=self._tensor_debug_mode,
|
||||||
|
debug_urls=debug_urls)
|
||||||
if is_v1_graph_mode:
|
if is_v1_graph_mode:
|
||||||
instrumented_tensors.append(self._process_v1_graph_mode_tensor(
|
instrumented_tensors.append(self._process_v1_graph_mode_tensor(
|
||||||
op_type, tensor, debug_tensor, tensor_debug_mode))
|
op_type, tensor, debug_tensor, tensor_debug_mode))
|
||||||
|
|||||||
@ -982,7 +982,7 @@ tf_module {
|
|||||||
}
|
}
|
||||||
member_method {
|
member_method {
|
||||||
name: "DebugIdentityV2"
|
name: "DebugIdentityV2"
|
||||||
argspec: "args=[\'input\', \'tfdbg_context_id\', \'op_name\', \'output_slot\', \'tensor_debug_mode\', \'debug_urls\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'-1\', \'-1\', \'[]\', \'None\'], "
|
argspec: "args=[\'input\', \'tfdbg_context_id\', \'op_name\', \'output_slot\', \'tensor_debug_mode\', \'debug_urls\', \'circular_buffer_size\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'-1\', \'-1\', \'[]\', \'1000\', \'None\'], "
|
||||||
}
|
}
|
||||||
member_method {
|
member_method {
|
||||||
name: "DebugNanCount"
|
name: "DebugNanCount"
|
||||||
|
|||||||
@ -982,7 +982,7 @@ tf_module {
|
|||||||
}
|
}
|
||||||
member_method {
|
member_method {
|
||||||
name: "DebugIdentityV2"
|
name: "DebugIdentityV2"
|
||||||
argspec: "args=[\'input\', \'tfdbg_context_id\', \'op_name\', \'output_slot\', \'tensor_debug_mode\', \'debug_urls\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'-1\', \'-1\', \'[]\', \'None\'], "
|
argspec: "args=[\'input\', \'tfdbg_context_id\', \'op_name\', \'output_slot\', \'tensor_debug_mode\', \'debug_urls\', \'circular_buffer_size\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'-1\', \'-1\', \'[]\', \'1000\', \'None\'], "
|
||||||
}
|
}
|
||||||
member_method {
|
member_method {
|
||||||
name: "DebugNanCount"
|
name: "DebugNanCount"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user