Removing some unused options from tensor tracer.

PiperOrigin-RevId: 333619299
Change-Id: Ib9303bf9f134f941a33636561ba266c34504fb3a
This commit is contained in:
Mehmet Deveci 2020-09-24 16:08:47 -07:00 committed by TensorFlower Gardener
parent 34231a62a6
commit fbe4f6e8a4
3 changed files with 29 additions and 191 deletions

View File

@ -145,11 +145,7 @@ def set_parameters(tensor_tracer_params=None):
- full_tensor_summary: Writes the full tensors as binary event files.
The outputs can be read using: trace =
tensor_tracer.read_tensor_tracer_event_file(event_file_path)
- trace-back-if-nan: This mode will write the full tensor content only
when the tensor has a NaN or Inf in it. It is possible to also print
the inputs coming to this op using 'trace_stack_size' parameter.
E.g., if trace_stack_size=2, then the tensor with NaN/Inf, its
inputs, and its inputs' inputs will also be printed.
- report_file: Path to the metadata file that is written during graph
construction. If not set, metadata will be printed to stdout during
graph construction.
@ -181,32 +177,14 @@ def set_parameters(tensor_tracer_params=None):
'--included_optypes=some_op_type --excluded_optypes=*.' will trace
only the ops with type 'some_op_type'
Advanced Flags:
- compact_trace: If not set, statistics per tensor are written as soon as
they are executed. If set, then statistics for all traced tensors will
be stored in a cache and will be written only once per step. This flag
is ignored for full-tensor and part-tensor trace modes. If the
trace_dir is a remote directory, compact_trace will be forced.
- trace_scalar: Scalar values are not traced by default. If this flag is
set, scalar values will also be traced.
- included_cores: Accepts a list string. Tracing will only be dumped for
these cores. E.g., setting it to '[0,2,4,6]' will result in a trace
only for those cores.
- op_range: In the form of '%d:%d' that limits the tracing to the ops
within this limit. --op_range='5:10' will trace only the ops that have
topological order between 5-10.
- trace_before_included_ops: If set to a number-k, it will also trace
distance-k inputs of each traced tensor. E.g., k=1, then in addition
to each traced_tensor, their input tensors will also be traced.
- trace_after_included_ops: Same as trace_before_included_ops, where it
will also trace distance-k outputs of each traced tensor.
- submode: 'brief' or 'detailed'. If the trace mode is not compact,
brief mode will print only the id of each traced tensor to save some
space. 'detailed' mode prints the full tensor name.
- trace_stack_size: Used only for trace_mode=trace-back-if-nan mode. It
determines how many ops to print back from a nan op. E.g., op4 -> op3
-> op2 -> op1 -> op0, if op0 has a NaN and trace_stack_size is 1, the
result of op1 will also be printed. trace_stack_size is 2, the result
of op1 and op2 will be printed.
- use_fingerprint_subdirectory: The trace directory will be chosen as
using the fingerprint of the trace metadata under the provided
trace_dir.
@ -527,9 +505,6 @@ class TensorTracer(object):
def _is_interesting_op(self, op):
"""Returns True if the given op is not an interesting one to be traced."""
# If flag is set to include less interesting ops, then include everything.
if self._parameters.include_less_interesting_ops:
return True
return op_priority(op.type) <= self._parameters.trace_level
@staticmethod
@ -655,34 +630,14 @@ class TensorTracer(object):
- The op is at most _trace_ops_before_included hops before an included op
- The op is at most _trace_ops_after_included hops after an included op
"""
for opname_re in self._parameters.included_opname_re_list:
if opname_re.match(op.name):
return True
def _is_op_or_any_neighbor_included(op, check_before=0, check_after=0):
"""Helper function to check if op is included or not."""
for opname_re in self._parameters.included_opname_re_list:
if opname_re.match(op.name):
return True
for optype_re in self._parameters.included_optype_re_list:
if optype_re.match(op.type):
return True
if check_after > 0:
for out_tensor in op.outputs:
for consumer in out_tensor.consumers():
if _is_op_or_any_neighbor_included(consumer, check_after - 1, 0):
return True
if check_before > 0:
for input_tensor in op.inputs:
if _is_op_or_any_neighbor_included(input_tensor.op,
0,
check_before - 1):
return True
return False
# check_after and check_before are swapped below, as below operation
# checks the distance from an arbitrary op to included ops.
return _is_op_or_any_neighbor_included(
op, self._parameters.trace_ops_after_included,
self._parameters.trace_ops_before_included)
for optype_re in self._parameters.included_optype_re_list:
if optype_re.match(op.type):
return True
return False
def _is_user_excluded_op(self, op):
for opname_re in self._parameters.excluded_opname_re_list:
@ -726,20 +681,6 @@ class TensorTracer(object):
def _use_tensor_values_cache(self):
"""Returns True if immediate tensors should be first saved to a cache."""
if self._parameters.trace_mode == tensor_tracer_flags.TRACE_MODE_SUMMARY:
# For summary trace mode only compact format is supported.
return True
if self._parameters.trace_mode not in set([
tensor_tracer_flags.TRACE_MODE_NAN_INF,
tensor_tracer_flags.TRACE_MODE_NORM,
tensor_tracer_flags.TRACE_MODE_MAX_ABS,
tensor_tracer_flags.TRACE_MODE_SUMMARY
]):
return False
if (self._parameters.trace_dir and
_trace_files_need_precreated(self._parameters.trace_dir)):
return True
return self._parameters.use_compact_trace
def _use_tensor_buffer(self):
@ -898,26 +839,6 @@ class TensorTracer(object):
output_tensor = array_ops.reshape(output_tensor, [1])
return output_tensor
def _detect_inf_nan_producer(tensor):
"""Checks if the tensor is the first NaN/Inf tensor in the computation path."""
if tensor.op.inputs:
inp_check = [
_detect_nan_inf(inp_tensor) for inp_tensor in tensor.op.inputs
]
is_any_input_inf_nan = math_ops.add_n(inp_check)
else:
is_any_input_inf_nan = constant_op.constant(0, dtypes.bool)
is_current_tensor_inf_nan = _detect_nan_inf(tensor)
# An op is NaN/INF producer only when all inputs are nan/inf free (
# is_any_input_inf_nan = 0), and its output has nan/inf (
# is_current_tensor_inf_nan=1). Below will be 1 if op nan/inf is producer.
is_nan_producer = is_current_tensor_inf_nan - is_any_input_inf_nan
is_nan_producer = math_ops.reduce_any(is_nan_producer > 0)
return is_nan_producer
if (self._parameters.trace_mode ==
tensor_tracer_flags.TRACE_MODE_FULL_IF_NAN):
return {self._parameters.trace_mode: _detect_inf_nan_producer(tensor)}
if self._parameters.trace_mode == tensor_tracer_flags.TRACE_MODE_NAN_INF:
return {self._parameters.trace_mode: _detect_nan_inf(tensor)}
if (self._parameters.trace_mode ==
@ -993,14 +914,15 @@ class TensorTracer(object):
Raises:
ValueError: If tensor_name is not already in
tensor_trace_order.tensorname_idx_map.
tensor_trace_order.tensorname_to_cache_idx.
"""
if self._parameters.is_brief_mode():
if tensor_name not in tensor_trace_order.tensorname_idx_map:
if tensor_name not in tensor_trace_order.tensorname_to_cache_idx:
raise ValueError(
'Tensor name %s is not in the tensorname_idx_map'%tensor_name)
msg = '%d' % tensor_trace_order.tensorname_idx_map[tensor_name]
'Tensor name %s is not in the tensorname_to_cache_idx' %
tensor_name)
msg = '%d' % tensor_trace_order.tensorname_to_cache_idx[tensor_name]
else:
msg = '"%s"' % tensor_name
@ -1026,38 +948,6 @@ class TensorTracer(object):
return _print_tensor(tensor_name, -1, tensor, tensor)
def _show_full_tensors(tensor):
"""Prints the full tensor values for the tensors that are _trace_stack_size hops away from a given tensor."""
def _get_distance_k_tensors(k_before=0):
"""Returns the tensors that are at most k_before hops away from the tensor."""
if k_before < 0:
return []
visited_tensors = {tensor: 0}
visitor_queue = [tensor]
head = 0
while head < len(visitor_queue):
current_tensor = visitor_queue[head]
head += 1
distance = visited_tensors[current_tensor]
if distance == k_before:
break
for input_tensor in current_tensor.op.inputs:
if input_tensor in visited_tensors:
continue
visitor_queue.append(input_tensor)
visited_tensors[input_tensor] = distance + 1
return visitor_queue
tensors_to_print = _get_distance_k_tensors(
self._parameters.trace_stack_size)
print_ops = [_print_tensor(t.name, -1, t, t) for t in tensors_to_print]
with ops.control_dependencies(print_ops):
return constant_op.constant(True)
if (self._parameters.trace_mode ==
tensor_tracer_flags.TRACE_MODE_FULL_IF_NAN):
return _show_full_tensors
if (self._parameters.trace_mode ==
tensor_tracer_flags.TRACE_MODE_PART_TENSOR):
return _show_part_tensor
@ -1891,13 +1781,6 @@ class TensorTracer(object):
else:
return tensor_trace_fn(tensor)
def conditional_trace_fn(predicate_tensor, out_tensor, trace_fn,
out_tensor_name):
"""Creates a cond op that traces the out_tensor if predicate is satisfied."""
return control_flow_ops.cond(
predicate_tensor, lambda: trace_fn(out_tensor, out_tensor_name),
lambda: constant_op.constant(False)).op
if len(processed_tensors) != 1:
raise RuntimeError('Multiple stats are only allowed in compact '
'mode.')
@ -1905,20 +1788,7 @@ class TensorTracer(object):
# mode that uses compact format(self._use_tensor_values_cache = true).
# Non-compact mode currently allows single stat per tensor.
processed_out_tensor = six.next(six.itervalues(processed_tensors))
if self._parameters.is_conditional_trace:
trace_op = conditional_trace_fn(processed_out_tensor, out_tensor,
tpu_wrap_trace_fn, tensor_name)
elif self._parameters.included_cores:
should_print = constant_op.constant(False)
for core in self._parameters.included_cores:
should_print = gen_math_ops.logical_or(
should_print, gen_math_ops.equal(self._replica_id, core))
trace_op = conditional_trace_fn(should_print, processed_out_tensor,
tpu_wrap_trace_fn, tensor_name)
else:
trace_op = tpu_wrap_trace_fn(processed_out_tensor, tensor_name)
trace_op = tpu_wrap_trace_fn(processed_out_tensor, tensor_name)
if op_control_flow_context:
# pylint: disable=protected-access

View File

@ -27,16 +27,17 @@ from tensorflow.python.ops import linalg_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.platform import tf_logging as logging
TRACE_MODE_NAN_INF = 'nan-inf'
TRACE_MODE_PART_TENSOR = 'part-tensor'
TRACE_MODE_FULL_TENSOR = 'full-tensor'
TRACE_MODE_FULL_IF_NAN = 'trace-back-if-nan'
TRACE_MODE_FULL_TENSOR_SUMMARY = 'full_tensor_summary'
TRACE_MODE_NAN_INF = 'nan-inf'
TRACE_MODE_NORM = 'norm'
TRACE_MODE_MAX_ABS = 'max-abs'
TRACE_MODE_SUMMARY = 'summary'
# summary mode to collects a finite set of signatures for each traced tensor,
# (such as norm, max, min, mean) and dumps it using tb summaries.
TRACE_MODE_FULL_TENSOR_SUMMARY = 'full_tensor_summary'
# Full tensor mode dumps the whole tensor values for the traced tensors without
# any processing on them; using tb summaries.
@ -49,20 +50,14 @@ _FLAG_NO_QUOTE_PAT = re.compile(r'\s*--([^=]+)=(\S*)')
_FLAG_NO_EQUAL_PAT = re.compile(r'\s*--([^=]+)\s*')
FLAGS_ENV_VAR = 'TENSOR_TRACER_FLAGS'
FLAG_NAME_TRACE_STACK_SIZE = 'trace_stack_size'
FLAG_NAME_ENABLE = 'enable'
FLAG_NAME_TRACE_MODE = 'trace_mode'
FLAG_NAME_USE_COMPACT_TRACE = 'compact_trace'
FLAG_NAME_TRACE_SCALAR_OPS = 'trace_scalar'
FLAG_NAME_TRACE_BEFORE_OPS = 'trace_before_included_ops'
FLAG_NAME_TRACE_AFTER_OPS = 'trace_after_included_ops'
FLAG_NAME_SUBMODE = 'submode'
FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS = 'include_less_interesting_ops'
FLAG_NAME_EXCLUDED_OPNAMES = 'excluded_opnames'
FLAG_NAME_EXCLUDED_OPTYPES = 'excluded_optypes'
FLAG_NAME_INCLUDED_OPNAMES = 'included_opnames'
FLAG_NAME_INCLUDED_OPTYPES = 'included_optypes'
FLAG_NAME_INCLUDED_CORES = 'included_cores'
FLAG_NAME_TRACE_LEVEL = 'trace_level'
FLAG_NAME_TRACE_DIR = 'trace_dir'
FLAG_NAME_REPORT_FILE = 'report_file'
@ -124,41 +119,21 @@ class TTParameters(object):
self.included_optype_re_list = self._flag_value_to_re_list(
FLAG_NAME_INCLUDED_OPTYPES)
self.is_conditional_trace = self._is_conditional_trace_mode()
self.trace_scalar_ops = self.is_flag_on(FLAG_NAME_TRACE_SCALAR_OPS)
self.use_compact_trace = self.is_flag_on(FLAG_NAME_USE_COMPACT_TRACE)
self.use_compact_trace = self.trace_mode in (TRACE_MODE_NAN_INF,
TRACE_MODE_NORM,
TRACE_MODE_MAX_ABS,
TRACE_MODE_SUMMARY)
self.use_temp_cache_var = self.is_flag_on(FLAG_NAME_TEMP_CACHE_VAR)
self.use_fingerprint_subdir = self.is_flag_on(FLAG_NAME_FINGERPRINT_DIR)
# _trace_ops_before_included and _trace_ops_after_included denotes to depth
# of tracing relative to the ops given in --included_opnames or
# --included_optypes
# For example, in the below graph
# op1 --> op2 --> op3 --> op4 --> op5
# If --included_opnames=op3 then only op3 will be traced.
# If also --trace_before_included_ops=2 (_trace_ops_before_included), then
# op1 and op2 will be traced as they are at most 2 hops apart from an
# included op. Similarly, if --trace_after_included_ops=2, then op4 and op5
# will also be traced.
self.trace_ops_before_included = self._get_flag_int_value(
FLAG_NAME_TRACE_BEFORE_OPS, 0)
self.trace_ops_after_included = self._get_flag_int_value(
FLAG_NAME_TRACE_AFTER_OPS, 0)
self.trace_stack_size = self._get_flag_int_value(FLAG_NAME_TRACE_STACK_SIZE,
1)
_, self.graph_dump_path = self.get_flag_value(
FLAG_NAME_DUMP_BEFORE_AFTER_GRAPHS)
self.included_cores = self._flag_value_as_int_list(FLAG_NAME_INCLUDED_CORES)
self.include_less_interesting_ops = self.is_flag_on(
FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS)
self.trace_level = self._get_flag_int_value(FLAG_NAME_TRACE_LEVEL,
_TT_DEFAULT_TRACE_LEVEL)
self.summary_signatures = self._get_summary_signatures()
self.collect_summary_per_core = self.is_flag_on(FLAG_NAME_SUMMARY_PER_CORE)
def _is_conditional_trace_mode(self):
return self.trace_mode == TRACE_MODE_FULL_IF_NAN
def _get_report_filepath(self):
"""Sets the path of the output report file."""
@ -205,7 +180,7 @@ class TTParameters(object):
trace_mode = TRACE_MODE_NORM
valid_trace_modes = [
TRACE_MODE_NAN_INF, TRACE_MODE_PART_TENSOR, TRACE_MODE_FULL_TENSOR,
TRACE_MODE_NORM, TRACE_MODE_MAX_ABS, TRACE_MODE_FULL_IF_NAN,
TRACE_MODE_NORM, TRACE_MODE_MAX_ABS,
TRACE_MODE_SUMMARY, TRACE_MODE_FULL_TENSOR_SUMMARY
]
if trace_mode not in valid_trace_modes:
@ -265,15 +240,14 @@ class TTParameters(object):
def _validate_flag_names(self):
"""Validates if the TensorTrace flags passed are valid."""
valid_flag_names = [
FLAG_NAME_ENABLE, FLAG_NAME_TRACE_MODE, FLAG_NAME_USE_COMPACT_TRACE,
FLAG_NAME_TRACE_SCALAR_OPS, FLAG_NAME_TRACE_BEFORE_OPS,
FLAG_NAME_TRACE_AFTER_OPS, FLAG_NAME_TRACE_STACK_SIZE,
FLAG_NAME_ENABLE, FLAG_NAME_TRACE_MODE,
FLAG_NAME_TRACE_SCALAR_OPS,
FLAG_NAME_SUBMODE, FLAG_NAME_EXCLUDED_OPNAMES,
FLAG_NAME_EXCLUDED_OPTYPES, FLAG_NAME_INCLUDED_OPNAMES,
FLAG_NAME_INCLUDED_OPTYPES, FLAG_NAME_TRACE_DIR,
FLAG_NAME_INCLUDED_CORES, FLAG_NAME_REPORT_FILE,
FLAG_NAME_REPORT_FILE,
FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR,
FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS, FLAG_NAME_OP_RANGE,
FLAG_NAME_OP_RANGE,
FLAG_NAME_DUMP_BEFORE_AFTER_GRAPHS, FLAG_NAME_TRACE_LEVEL,
FLAG_NAME_SUMMARY_SIGNATURES, FLAG_NAME_SUMMARY_PER_CORE,
FLAG_NAME_TEMP_CACHE_VAR, FLAG_NAME_FINGERPRINT_DIR

View File

@ -266,8 +266,6 @@ class TTReportHandle(object):
report.config.num_cores = tt_config.num_replicas
report.config.num_hosts = tt_config.num_hosts
report.config.num_cores_per_host = tt_config.num_replicas_per_host
for core in tt_parameters.included_cores:
report.config.included_cores.append(core)
report.config.submode = tt_parameters.submode
report.config.trace_mode = tt_parameters.trace_mode
@ -351,12 +349,8 @@ class TTReportHandle(object):
tt_parameters.trace_mode))
self._write_report('%s %s\n'%(_FIELD_NAME_SUBMODE,
tt_parameters.submode))
if tt_parameters.included_cores:
self._write_report('%s %s\n'%(_FIELD_NAME_NUM_REPLICAS,
len(tt_parameters.included_cores)))
else:
self._write_report('%s %s\n'%(_FIELD_NAME_NUM_REPLICAS,
tt_config.num_replicas))
self._write_report('%s %s\n'%(_FIELD_NAME_NUM_REPLICAS,
tt_config.num_replicas))
self._write_report('%s %s\n'%(_FIELD_NAME_NUM_REPLICAS_PER_HOST,
tt_config.num_replicas_per_host))
self._write_report('%s %s\n'%(_FIELD_NAME_NUM_HOSTS, tt_config.num_hosts))