Removing some unused options from tensor tracer.
PiperOrigin-RevId: 333619299 Change-Id: Ib9303bf9f134f941a33636561ba266c34504fb3a
This commit is contained in:
parent
34231a62a6
commit
fbe4f6e8a4
tensorflow/python/tpu
@ -145,11 +145,7 @@ def set_parameters(tensor_tracer_params=None):
|
||||
- full_tensor_summary: Writes the full tensors as binary event files.
|
||||
The outputs can be read using: trace =
|
||||
tensor_tracer.read_tensor_tracer_event_file(event_file_path)
|
||||
- trace-back-if-nan: This mode will write the full tensor content only
|
||||
when the tensor has a NaN or Inf in it. It is possible to also print
|
||||
the inputs coming to this op using 'trace_stack_size' parameter.
|
||||
E.g., if trace_stack_size=2, then the tensor with NaN/Inf, its
|
||||
inputs, and its inputs' inputs will also be printed.
|
||||
|
||||
- report_file: Path to the metadata file that is written during graph
|
||||
construction. If not set, metadata will be printed to stdout during
|
||||
graph construction.
|
||||
@ -181,32 +177,14 @@ def set_parameters(tensor_tracer_params=None):
|
||||
'--included_optypes=some_op_type --excluded_optypes=*.' will trace
|
||||
only the ops with type 'some_op_type'
|
||||
Advanced Flags:
|
||||
- compact_trace: If not set, statistics per tensor are written as soon as
|
||||
they are executed. If set, then statistics for all traced tensors will
|
||||
be stored in a cache and will be written only once per step. This flag
|
||||
is ignored for full-tensor and part-tensor trace modes. If the
|
||||
trace_dir is a remote directory, compact_trace will be forced.
|
||||
- trace_scalar: Scalar values are not traced by default. If this flag is
|
||||
set, scalar values will also be traced.
|
||||
- included_cores: Accepts a list string. Tracing will only be dumped for
|
||||
these cores. E.g., setting it to '[0,2,4,6]' will result in a trace
|
||||
only for those cores.
|
||||
- op_range: In the form of '%d:%d' that limits the tracing to the ops
|
||||
within this limit. --op_range='5:10' will trace only the ops that have
|
||||
topological order between 5-10.
|
||||
- trace_before_included_ops: If set to a number-k, it will also trace
|
||||
distance-k inputs of each traced tensor. E.g., k=1, then in addition
|
||||
to each traced_tensor, their input tensors will also be traced.
|
||||
- trace_after_included_ops: Same as trace_before_included_ops, where it
|
||||
will also trace distance-k outputs of each traced tensor.
|
||||
- submode: 'brief' or 'detailed'. If the trace mode is not compact,
|
||||
brief mode will print only the id of each traced tensor to save some
|
||||
space. 'detailed' mode prints the full tensor name.
|
||||
- trace_stack_size: Used only for trace_mode=trace-back-if-nan mode. It
|
||||
determines how many ops to print back from a nan op. E.g., op4 -> op3
|
||||
-> op2 -> op1 -> op0, if op0 has a NaN and trace_stack_size is 1, the
|
||||
result of op1 will also be printed. If trace_stack_size is 2, the results
|
||||
of op1 and op2 will be printed.
|
||||
- use_fingerprint_subdirectory: The trace directory will be chosen by
|
||||
using the fingerprint of the trace metadata under the provided
|
||||
trace_dir.
|
||||
@ -527,9 +505,6 @@ class TensorTracer(object):
|
||||
|
||||
def _is_interesting_op(self, op):
|
||||
"""Returns True if the given op is an interesting one to be traced."""
|
||||
# If flag is set to include less interesting ops, then include everything.
|
||||
if self._parameters.include_less_interesting_ops:
|
||||
return True
|
||||
return op_priority(op.type) <= self._parameters.trace_level
|
||||
|
||||
@staticmethod
|
||||
@ -655,34 +630,14 @@ class TensorTracer(object):
|
||||
- The op is at most _trace_ops_before_included hops before an included op
|
||||
- The op is at most _trace_ops_after_included hops after an included op
|
||||
"""
|
||||
for opname_re in self._parameters.included_opname_re_list:
|
||||
if opname_re.match(op.name):
|
||||
return True
|
||||
|
||||
def _is_op_or_any_neighbor_included(op, check_before=0, check_after=0):
|
||||
"""Helper function to check if op is included or not."""
|
||||
for opname_re in self._parameters.included_opname_re_list:
|
||||
if opname_re.match(op.name):
|
||||
return True
|
||||
|
||||
for optype_re in self._parameters.included_optype_re_list:
|
||||
if optype_re.match(op.type):
|
||||
return True
|
||||
|
||||
if check_after > 0:
|
||||
for out_tensor in op.outputs:
|
||||
for consumer in out_tensor.consumers():
|
||||
if _is_op_or_any_neighbor_included(consumer, check_after - 1, 0):
|
||||
return True
|
||||
if check_before > 0:
|
||||
for input_tensor in op.inputs:
|
||||
if _is_op_or_any_neighbor_included(input_tensor.op,
|
||||
0,
|
||||
check_before - 1):
|
||||
return True
|
||||
return False
|
||||
# check_after and check_before are swapped below, as below operation
|
||||
# checks the distance from an arbitrary op to included ops.
|
||||
return _is_op_or_any_neighbor_included(
|
||||
op, self._parameters.trace_ops_after_included,
|
||||
self._parameters.trace_ops_before_included)
|
||||
for optype_re in self._parameters.included_optype_re_list:
|
||||
if optype_re.match(op.type):
|
||||
return True
|
||||
return False
|
||||
|
||||
def _is_user_excluded_op(self, op):
|
||||
for opname_re in self._parameters.excluded_opname_re_list:
|
||||
@ -726,20 +681,6 @@ class TensorTracer(object):
|
||||
|
||||
def _use_tensor_values_cache(self):
|
||||
"""Returns True if immediate tensors should be first saved to a cache."""
|
||||
if self._parameters.trace_mode == tensor_tracer_flags.TRACE_MODE_SUMMARY:
|
||||
# For summary trace mode, only the compact format is supported.
|
||||
return True
|
||||
|
||||
if self._parameters.trace_mode not in set([
|
||||
tensor_tracer_flags.TRACE_MODE_NAN_INF,
|
||||
tensor_tracer_flags.TRACE_MODE_NORM,
|
||||
tensor_tracer_flags.TRACE_MODE_MAX_ABS,
|
||||
tensor_tracer_flags.TRACE_MODE_SUMMARY
|
||||
]):
|
||||
return False
|
||||
if (self._parameters.trace_dir and
|
||||
_trace_files_need_precreated(self._parameters.trace_dir)):
|
||||
return True
|
||||
return self._parameters.use_compact_trace
|
||||
|
||||
def _use_tensor_buffer(self):
|
||||
@ -898,26 +839,6 @@ class TensorTracer(object):
|
||||
output_tensor = array_ops.reshape(output_tensor, [1])
|
||||
return output_tensor
|
||||
|
||||
def _detect_inf_nan_producer(tensor):
|
||||
"""Checks if the tensor is the first NaN/Inf tensor in the computation path."""
|
||||
if tensor.op.inputs:
|
||||
inp_check = [
|
||||
_detect_nan_inf(inp_tensor) for inp_tensor in tensor.op.inputs
|
||||
]
|
||||
is_any_input_inf_nan = math_ops.add_n(inp_check)
|
||||
else:
|
||||
is_any_input_inf_nan = constant_op.constant(0, dtypes.bool)
|
||||
is_current_tensor_inf_nan = _detect_nan_inf(tensor)
|
||||
# An op is NaN/INF producer only when all inputs are nan/inf free (
|
||||
# is_any_input_inf_nan = 0), and its output has nan/inf (
|
||||
# is_current_tensor_inf_nan=1). Below will be 1 if the op is a nan/inf producer.
|
||||
is_nan_producer = is_current_tensor_inf_nan - is_any_input_inf_nan
|
||||
is_nan_producer = math_ops.reduce_any(is_nan_producer > 0)
|
||||
return is_nan_producer
|
||||
|
||||
if (self._parameters.trace_mode ==
|
||||
tensor_tracer_flags.TRACE_MODE_FULL_IF_NAN):
|
||||
return {self._parameters.trace_mode: _detect_inf_nan_producer(tensor)}
|
||||
if self._parameters.trace_mode == tensor_tracer_flags.TRACE_MODE_NAN_INF:
|
||||
return {self._parameters.trace_mode: _detect_nan_inf(tensor)}
|
||||
if (self._parameters.trace_mode ==
|
||||
@ -993,14 +914,15 @@ class TensorTracer(object):
|
||||
|
||||
Raises:
|
||||
ValueError: If tensor_name is not already in
|
||||
tensor_trace_order.tensorname_idx_map.
|
||||
tensor_trace_order.tensorname_to_cache_idx.
|
||||
"""
|
||||
|
||||
if self._parameters.is_brief_mode():
|
||||
if tensor_name not in tensor_trace_order.tensorname_idx_map:
|
||||
if tensor_name not in tensor_trace_order.tensorname_to_cache_idx:
|
||||
raise ValueError(
|
||||
'Tensor name %s is not in the tensorname_idx_map'%tensor_name)
|
||||
msg = '%d' % tensor_trace_order.tensorname_idx_map[tensor_name]
|
||||
'Tensor name %s is not in the tensorname_to_cache_idx' %
|
||||
tensor_name)
|
||||
msg = '%d' % tensor_trace_order.tensorname_to_cache_idx[tensor_name]
|
||||
else:
|
||||
msg = '"%s"' % tensor_name
|
||||
|
||||
@ -1026,38 +948,6 @@ class TensorTracer(object):
|
||||
|
||||
return _print_tensor(tensor_name, -1, tensor, tensor)
|
||||
|
||||
def _show_full_tensors(tensor):
|
||||
"""Prints the full tensor values for the tensors that are _trace_stack_size hops away from a given tensor."""
|
||||
|
||||
def _get_distance_k_tensors(k_before=0):
|
||||
"""Returns the tensors that are at most k_before hops away from the tensor."""
|
||||
if k_before < 0:
|
||||
return []
|
||||
visited_tensors = {tensor: 0}
|
||||
visitor_queue = [tensor]
|
||||
head = 0
|
||||
while head < len(visitor_queue):
|
||||
current_tensor = visitor_queue[head]
|
||||
head += 1
|
||||
distance = visited_tensors[current_tensor]
|
||||
if distance == k_before:
|
||||
break
|
||||
for input_tensor in current_tensor.op.inputs:
|
||||
if input_tensor in visited_tensors:
|
||||
continue
|
||||
visitor_queue.append(input_tensor)
|
||||
visited_tensors[input_tensor] = distance + 1
|
||||
return visitor_queue
|
||||
|
||||
tensors_to_print = _get_distance_k_tensors(
|
||||
self._parameters.trace_stack_size)
|
||||
print_ops = [_print_tensor(t.name, -1, t, t) for t in tensors_to_print]
|
||||
with ops.control_dependencies(print_ops):
|
||||
return constant_op.constant(True)
|
||||
|
||||
if (self._parameters.trace_mode ==
|
||||
tensor_tracer_flags.TRACE_MODE_FULL_IF_NAN):
|
||||
return _show_full_tensors
|
||||
if (self._parameters.trace_mode ==
|
||||
tensor_tracer_flags.TRACE_MODE_PART_TENSOR):
|
||||
return _show_part_tensor
|
||||
@ -1891,13 +1781,6 @@ class TensorTracer(object):
|
||||
else:
|
||||
return tensor_trace_fn(tensor)
|
||||
|
||||
def conditional_trace_fn(predicate_tensor, out_tensor, trace_fn,
|
||||
out_tensor_name):
|
||||
"""Creates a cond op that traces the out_tensor if predicate is satisfied."""
|
||||
return control_flow_ops.cond(
|
||||
predicate_tensor, lambda: trace_fn(out_tensor, out_tensor_name),
|
||||
lambda: constant_op.constant(False)).op
|
||||
|
||||
if len(processed_tensors) != 1:
|
||||
raise RuntimeError('Multiple stats are only allowed in compact '
|
||||
'mode.')
|
||||
@ -1905,20 +1788,7 @@ class TensorTracer(object):
|
||||
# mode that uses compact format(self._use_tensor_values_cache = true).
|
||||
# Non-compact mode currently allows single stat per tensor.
|
||||
processed_out_tensor = six.next(six.itervalues(processed_tensors))
|
||||
|
||||
if self._parameters.is_conditional_trace:
|
||||
trace_op = conditional_trace_fn(processed_out_tensor, out_tensor,
|
||||
tpu_wrap_trace_fn, tensor_name)
|
||||
elif self._parameters.included_cores:
|
||||
should_print = constant_op.constant(False)
|
||||
for core in self._parameters.included_cores:
|
||||
should_print = gen_math_ops.logical_or(
|
||||
should_print, gen_math_ops.equal(self._replica_id, core))
|
||||
trace_op = conditional_trace_fn(should_print, processed_out_tensor,
|
||||
tpu_wrap_trace_fn, tensor_name)
|
||||
|
||||
else:
|
||||
trace_op = tpu_wrap_trace_fn(processed_out_tensor, tensor_name)
|
||||
trace_op = tpu_wrap_trace_fn(processed_out_tensor, tensor_name)
|
||||
|
||||
if op_control_flow_context:
|
||||
# pylint: disable=protected-access
|
||||
|
@ -27,16 +27,17 @@ from tensorflow.python.ops import linalg_ops
|
||||
from tensorflow.python.ops import math_ops
|
||||
from tensorflow.python.platform import tf_logging as logging
|
||||
|
||||
TRACE_MODE_NAN_INF = 'nan-inf'
|
||||
TRACE_MODE_PART_TENSOR = 'part-tensor'
|
||||
TRACE_MODE_FULL_TENSOR = 'full-tensor'
|
||||
TRACE_MODE_FULL_IF_NAN = 'trace-back-if-nan'
|
||||
TRACE_MODE_FULL_TENSOR_SUMMARY = 'full_tensor_summary'
|
||||
|
||||
TRACE_MODE_NAN_INF = 'nan-inf'
|
||||
TRACE_MODE_NORM = 'norm'
|
||||
TRACE_MODE_MAX_ABS = 'max-abs'
|
||||
TRACE_MODE_SUMMARY = 'summary'
|
||||
# Summary mode collects a finite set of signatures for each traced tensor,
|
||||
# (such as norm, max, min, mean) and dumps it using tb summaries.
|
||||
TRACE_MODE_FULL_TENSOR_SUMMARY = 'full_tensor_summary'
|
||||
|
||||
# Full tensor mode dumps the whole tensor values for the traced tensors without
|
||||
# any processing on them; using tb summaries.
|
||||
|
||||
@ -49,20 +50,14 @@ _FLAG_NO_QUOTE_PAT = re.compile(r'\s*--([^=]+)=(\S*)')
|
||||
_FLAG_NO_EQUAL_PAT = re.compile(r'\s*--([^=]+)\s*')
|
||||
|
||||
FLAGS_ENV_VAR = 'TENSOR_TRACER_FLAGS'
|
||||
FLAG_NAME_TRACE_STACK_SIZE = 'trace_stack_size'
|
||||
FLAG_NAME_ENABLE = 'enable'
|
||||
FLAG_NAME_TRACE_MODE = 'trace_mode'
|
||||
FLAG_NAME_USE_COMPACT_TRACE = 'compact_trace'
|
||||
FLAG_NAME_TRACE_SCALAR_OPS = 'trace_scalar'
|
||||
FLAG_NAME_TRACE_BEFORE_OPS = 'trace_before_included_ops'
|
||||
FLAG_NAME_TRACE_AFTER_OPS = 'trace_after_included_ops'
|
||||
FLAG_NAME_SUBMODE = 'submode'
|
||||
FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS = 'include_less_interesting_ops'
|
||||
FLAG_NAME_EXCLUDED_OPNAMES = 'excluded_opnames'
|
||||
FLAG_NAME_EXCLUDED_OPTYPES = 'excluded_optypes'
|
||||
FLAG_NAME_INCLUDED_OPNAMES = 'included_opnames'
|
||||
FLAG_NAME_INCLUDED_OPTYPES = 'included_optypes'
|
||||
FLAG_NAME_INCLUDED_CORES = 'included_cores'
|
||||
FLAG_NAME_TRACE_LEVEL = 'trace_level'
|
||||
FLAG_NAME_TRACE_DIR = 'trace_dir'
|
||||
FLAG_NAME_REPORT_FILE = 'report_file'
|
||||
@ -124,41 +119,21 @@ class TTParameters(object):
|
||||
self.included_optype_re_list = self._flag_value_to_re_list(
|
||||
FLAG_NAME_INCLUDED_OPTYPES)
|
||||
|
||||
self.is_conditional_trace = self._is_conditional_trace_mode()
|
||||
self.trace_scalar_ops = self.is_flag_on(FLAG_NAME_TRACE_SCALAR_OPS)
|
||||
self.use_compact_trace = self.is_flag_on(FLAG_NAME_USE_COMPACT_TRACE)
|
||||
self.use_compact_trace = self.trace_mode in (TRACE_MODE_NAN_INF,
|
||||
TRACE_MODE_NORM,
|
||||
TRACE_MODE_MAX_ABS,
|
||||
TRACE_MODE_SUMMARY)
|
||||
self.use_temp_cache_var = self.is_flag_on(FLAG_NAME_TEMP_CACHE_VAR)
|
||||
self.use_fingerprint_subdir = self.is_flag_on(FLAG_NAME_FINGERPRINT_DIR)
|
||||
|
||||
# _trace_ops_before_included and _trace_ops_after_included denote the depth
|
||||
# of tracing relative to the ops given in --included_opnames or
|
||||
# --included_optypes
|
||||
# For example, in the below graph
|
||||
# op1 --> op2 --> op3 --> op4 --> op5
|
||||
# If --included_opnames=op3 then only op3 will be traced.
|
||||
# If also --trace_before_included_ops=2 (_trace_ops_before_included), then
|
||||
# op1 and op2 will be traced as they are at most 2 hops apart from an
|
||||
# included op. Similarly, if --trace_after_included_ops=2, then op4 and op5
|
||||
# will also be traced.
|
||||
self.trace_ops_before_included = self._get_flag_int_value(
|
||||
FLAG_NAME_TRACE_BEFORE_OPS, 0)
|
||||
self.trace_ops_after_included = self._get_flag_int_value(
|
||||
FLAG_NAME_TRACE_AFTER_OPS, 0)
|
||||
self.trace_stack_size = self._get_flag_int_value(FLAG_NAME_TRACE_STACK_SIZE,
|
||||
1)
|
||||
_, self.graph_dump_path = self.get_flag_value(
|
||||
FLAG_NAME_DUMP_BEFORE_AFTER_GRAPHS)
|
||||
self.included_cores = self._flag_value_as_int_list(FLAG_NAME_INCLUDED_CORES)
|
||||
self.include_less_interesting_ops = self.is_flag_on(
|
||||
FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS)
|
||||
self.trace_level = self._get_flag_int_value(FLAG_NAME_TRACE_LEVEL,
|
||||
_TT_DEFAULT_TRACE_LEVEL)
|
||||
self.summary_signatures = self._get_summary_signatures()
|
||||
self.collect_summary_per_core = self.is_flag_on(FLAG_NAME_SUMMARY_PER_CORE)
|
||||
|
||||
def _is_conditional_trace_mode(self):
|
||||
return self.trace_mode == TRACE_MODE_FULL_IF_NAN
|
||||
|
||||
def _get_report_filepath(self):
|
||||
"""Sets the path of the output report file."""
|
||||
|
||||
@ -205,7 +180,7 @@ class TTParameters(object):
|
||||
trace_mode = TRACE_MODE_NORM
|
||||
valid_trace_modes = [
|
||||
TRACE_MODE_NAN_INF, TRACE_MODE_PART_TENSOR, TRACE_MODE_FULL_TENSOR,
|
||||
TRACE_MODE_NORM, TRACE_MODE_MAX_ABS, TRACE_MODE_FULL_IF_NAN,
|
||||
TRACE_MODE_NORM, TRACE_MODE_MAX_ABS,
|
||||
TRACE_MODE_SUMMARY, TRACE_MODE_FULL_TENSOR_SUMMARY
|
||||
]
|
||||
if trace_mode not in valid_trace_modes:
|
||||
@ -265,15 +240,14 @@ class TTParameters(object):
|
||||
def _validate_flag_names(self):
|
||||
"""Validates if the TensorTrace flags passed are valid."""
|
||||
valid_flag_names = [
|
||||
FLAG_NAME_ENABLE, FLAG_NAME_TRACE_MODE, FLAG_NAME_USE_COMPACT_TRACE,
|
||||
FLAG_NAME_TRACE_SCALAR_OPS, FLAG_NAME_TRACE_BEFORE_OPS,
|
||||
FLAG_NAME_TRACE_AFTER_OPS, FLAG_NAME_TRACE_STACK_SIZE,
|
||||
FLAG_NAME_ENABLE, FLAG_NAME_TRACE_MODE,
|
||||
FLAG_NAME_TRACE_SCALAR_OPS,
|
||||
FLAG_NAME_SUBMODE, FLAG_NAME_EXCLUDED_OPNAMES,
|
||||
FLAG_NAME_EXCLUDED_OPTYPES, FLAG_NAME_INCLUDED_OPNAMES,
|
||||
FLAG_NAME_INCLUDED_OPTYPES, FLAG_NAME_TRACE_DIR,
|
||||
FLAG_NAME_INCLUDED_CORES, FLAG_NAME_REPORT_FILE,
|
||||
FLAG_NAME_REPORT_FILE,
|
||||
FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR,
|
||||
FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS, FLAG_NAME_OP_RANGE,
|
||||
FLAG_NAME_OP_RANGE,
|
||||
FLAG_NAME_DUMP_BEFORE_AFTER_GRAPHS, FLAG_NAME_TRACE_LEVEL,
|
||||
FLAG_NAME_SUMMARY_SIGNATURES, FLAG_NAME_SUMMARY_PER_CORE,
|
||||
FLAG_NAME_TEMP_CACHE_VAR, FLAG_NAME_FINGERPRINT_DIR
|
||||
|
@ -266,8 +266,6 @@ class TTReportHandle(object):
|
||||
report.config.num_cores = tt_config.num_replicas
|
||||
report.config.num_hosts = tt_config.num_hosts
|
||||
report.config.num_cores_per_host = tt_config.num_replicas_per_host
|
||||
for core in tt_parameters.included_cores:
|
||||
report.config.included_cores.append(core)
|
||||
report.config.submode = tt_parameters.submode
|
||||
report.config.trace_mode = tt_parameters.trace_mode
|
||||
|
||||
@ -351,12 +349,8 @@ class TTReportHandle(object):
|
||||
tt_parameters.trace_mode))
|
||||
self._write_report('%s %s\n'%(_FIELD_NAME_SUBMODE,
|
||||
tt_parameters.submode))
|
||||
if tt_parameters.included_cores:
|
||||
self._write_report('%s %s\n'%(_FIELD_NAME_NUM_REPLICAS,
|
||||
len(tt_parameters.included_cores)))
|
||||
else:
|
||||
self._write_report('%s %s\n'%(_FIELD_NAME_NUM_REPLICAS,
|
||||
tt_config.num_replicas))
|
||||
self._write_report('%s %s\n'%(_FIELD_NAME_NUM_REPLICAS,
|
||||
tt_config.num_replicas))
|
||||
self._write_report('%s %s\n'%(_FIELD_NAME_NUM_REPLICAS_PER_HOST,
|
||||
tt_config.num_replicas_per_host))
|
||||
self._write_report('%s %s\n'%(_FIELD_NAME_NUM_HOSTS, tt_config.num_hosts))
|
||||
|
Loading…
Reference in New Issue
Block a user