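Remove unused options from tensor tracer.

Drops the 'trace-back-if-nan' trace mode and the compact_trace,
included_cores, trace_before_included_ops, trace_after_included_ops,
trace_stack_size and include_less_interesting_ops flags. Compact tracing is
now derived from the trace mode (nan-inf, norm, max-abs, summary) instead of
being set by a flag.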

PiperOrigin-RevId: 333619299 Change-Id: Ib9303bf9f134f941a33636561ba266c34504fb3a
2020-09-24 16:08:47 -07:00 · 2020-09-24 16:08:47 -07:00 · fbe4f6e8a4
commit fbe4f6e8a4
parent 34231a62a6
3 changed files with 29 additions and 191 deletions
--- a/tensorflow/python/tpu/tensor_tracer.py
+++ b/tensorflow/python/tpu/tensor_tracer.py
@ -145,11 +145,7 @@ def set_parameters(tensor_tracer_params=None):
          - full_tensor_summary: Writes the full tensors as binary event files.
            The outputs can be read using: trace =
              tensor_tracer.read_tensor_tracer_event_file(event_file_path)
-          - trace-back-if-nan: This mode will write the full tensor content only
-            when the tensor has a NaN or Inf in it. It is possible to also print
-            the inputs coming to this op using 'trace_stack_size' parameter.
-            E.g., if trace_stack_size=2, then the tensor with NaN/Inf, its
-            inputs, and its inputs' inputs will also be printed.
+
        - report_file: Path to the metadata file that is written during graph
          construction. If not set, metadata will be printed to stdout during
          graph construction.
@ -181,32 +177,14 @@ def set_parameters(tensor_tracer_params=None):
          '--included_optypes=some_op_type --excluded_optypes=*.' will trace
          only the ops with type 'some_op_type'
        Advanced Flags:
-        - compact_trace: If not set, statistics per tensor is written as soon as
-          they are executed. If set, then statistics for all traced tensors will
-          be stored in a cache and will be written only once per step. This flag
-          is ignored for full-tensor and part-tensor trace modes. If the
-          trace_dir is a remote directory, compact_trace will be forced.
        - trace_scalar: Scalar values are not traced by default. If this flag is
          set, scalar values will also be traced.
-        - included_cores: Accepts a list string. Tracing will only be dumped for
-          these cores. E.g, setting it to '[0,2,4,6]' will result in a trace
-          only for those cores.
        - op_range: In the form of '%d:%d' that limits the tracing to the ops
          within this limit. --op_range='5:10' will trace only the ops that have
            topological order between 5-10.
-        - trace_before_included_ops: If set to a number-k, it will also trace
-          distance-k inputs of each traced tensor. E.g., k=1, then in addition
-          to each traced_tensor, their input tensors will also be traced.
-        - trace_after_included_ops: Same as trace_before_included_ops, where it
-          will also trace distance-k outputs of each traced tensor.
        - submode: 'brief' or 'detailed'. If the trace mode is not compact,
          brief mode will print only the id of each traced tensor to save some
          space. 'detailed' mode prints the full tensor name.
-        - trace_stack_size: Used only for trace_mode=trace-back-if-nan mode. It
-          determines how many ops to print back from a nan op. E.g, op4 -> op3
-          -> op2 -> op1 -> op0, if op0 has a NaN and trace_stack_size is 1, the
-          result of op1 will also be printed. trace_stack_size is 2, the result
-          of op1 and op2 will be printed.
        - use_fingerprint_subdirectory: The trace directory will be chosen as
          using the fingerprint of the trace metadata under the provided
          trace_dir.
@ -527,9 +505,6 @@ class TensorTracer(object):

  def _is_interesting_op(self, op):
    """Returns True if the given op is not an interesting one to be traced."""
-    # If flag is set to include less interesting ops, then include everything.
-    if self._parameters.include_less_interesting_ops:
-      return True
    return op_priority(op.type) <= self._parameters.trace_level

  @staticmethod
@ -655,34 +630,14 @@ class TensorTracer(object):
      - The op is at most _trace_ops_before_included hops before an included op
      - The op is at most _trace_ops_after_included hops after an included op
    """
+    for opname_re in self._parameters.included_opname_re_list:
+      if opname_re.match(op.name):
+        return True

-    def _is_op_or_any_neighbor_included(op, check_before=0, check_after=0):
-      """Helper function to check if op is included or not."""
-      for opname_re in self._parameters.included_opname_re_list:
-        if opname_re.match(op.name):
-          return True
-
-      for optype_re in self._parameters.included_optype_re_list:
-        if optype_re.match(op.type):
-          return True
-
-      if check_after > 0:
-        for out_tensor in op.outputs:
-          for consumer in out_tensor.consumers():
-            if _is_op_or_any_neighbor_included(consumer, check_after - 1, 0):
-              return True
-      if check_before > 0:
-        for input_tensor in op.inputs:
-          if _is_op_or_any_neighbor_included(input_tensor.op,
-                                             0,
-                                             check_before - 1):
-            return True
-      return False
-    # check_after and check_before are swapped below, as below operation
-    # checks the distance from an arbitrary op to included ops.
-    return _is_op_or_any_neighbor_included(
-        op, self._parameters.trace_ops_after_included,
-        self._parameters.trace_ops_before_included)
+    for optype_re in self._parameters.included_optype_re_list:
+      if optype_re.match(op.type):
+        return True
+    return False

  def _is_user_excluded_op(self, op):
    for opname_re in self._parameters.excluded_opname_re_list:
@ -726,20 +681,6 @@ class TensorTracer(object):

  def _use_tensor_values_cache(self):
    """Returns True if immediate tensors should be first saved to a cache."""
-    if self._parameters.trace_mode == tensor_tracer_flags.TRACE_MODE_SUMMARY:
-      # For summary tace mode only compact format is supported.
-      return True
-
-    if self._parameters.trace_mode not in set([
-        tensor_tracer_flags.TRACE_MODE_NAN_INF,
-        tensor_tracer_flags.TRACE_MODE_NORM,
-        tensor_tracer_flags.TRACE_MODE_MAX_ABS,
-        tensor_tracer_flags.TRACE_MODE_SUMMARY
-    ]):
-      return False
-    if (self._parameters.trace_dir and
-        _trace_files_need_precreated(self._parameters.trace_dir)):
-      return True
    return self._parameters.use_compact_trace

  def _use_tensor_buffer(self):
@ -898,26 +839,6 @@ class TensorTracer(object):
      output_tensor = array_ops.reshape(output_tensor, [1])
      return output_tensor

-    def _detect_inf_nan_producer(tensor):
-      """Checks if the tensor is the first NaN/Inf tensor in the computation path."""
-      if tensor.op.inputs:
-        inp_check = [
-            _detect_nan_inf(inp_tensor) for inp_tensor in tensor.op.inputs
-        ]
-        is_any_input_inf_nan = math_ops.add_n(inp_check)
-      else:
-        is_any_input_inf_nan = constant_op.constant(0, dtypes.bool)
-      is_current_tensor_inf_nan = _detect_nan_inf(tensor)
-      # An op is NaN/INF producer only when all inputs are nan/inf free (
-      # is_any_input_inf_nan = 0), and its output has nan/inf (
-      # is_current_tensor_inf_nan=1). Below will be 1 if op nan/inf is producer.
-      is_nan_producer = is_current_tensor_inf_nan - is_any_input_inf_nan
-      is_nan_producer = math_ops.reduce_any(is_nan_producer > 0)
-      return is_nan_producer
-
-    if (self._parameters.trace_mode ==
-        tensor_tracer_flags.TRACE_MODE_FULL_IF_NAN):
-      return {self._parameters.trace_mode: _detect_inf_nan_producer(tensor)}
    if self._parameters.trace_mode == tensor_tracer_flags.TRACE_MODE_NAN_INF:
      return {self._parameters.trace_mode: _detect_nan_inf(tensor)}
    if (self._parameters.trace_mode ==
@ -993,14 +914,15 @@ class TensorTracer(object):

      Raises:
        ValueError: If tensor_name is not already in
-                    tensor_trace_order.tensorname_idx_map.
+                    tensor_trace_order.tensorname_to_cache_idx.
      """

      if self._parameters.is_brief_mode():
-        if tensor_name not in tensor_trace_order.tensorname_idx_map:
+        if tensor_name not in tensor_trace_order.tensorname_to_cache_idx:
          raise ValueError(
-              'Tensor name %s is not in the tensorname_idx_map'%tensor_name)
-        msg = '%d' % tensor_trace_order.tensorname_idx_map[tensor_name]
+              'Tensor name %s is not in the tensorname_to_cache_idx' %
+              tensor_name)
+        msg = '%d' % tensor_trace_order.tensorname_to_cache_idx[tensor_name]
      else:
        msg = '"%s"' % tensor_name

@ -1026,38 +948,6 @@ class TensorTracer(object):

      return _print_tensor(tensor_name, -1, tensor, tensor)

-    def _show_full_tensors(tensor):
-      """Prints the full tensor values for the tensors that are _trace_stack_size hops away from a given tensor."""
-
-      def _get_distance_k_tensors(k_before=0):
-        """Returns the tensors that are at most k_before hops away from the tensor."""
-        if k_before < 0:
-          return []
-        visited_tensors = {tensor: 0}
-        visitor_queue = [tensor]
-        head = 0
-        while head < len(visitor_queue):
-          current_tensor = visitor_queue[head]
-          head += 1
-          distance = visited_tensors[current_tensor]
-          if distance == k_before:
-            break
-          for input_tensor in current_tensor.op.inputs:
-            if input_tensor in visited_tensors:
-              continue
-            visitor_queue.append(input_tensor)
-            visited_tensors[input_tensor] = distance + 1
-        return visitor_queue
-
-      tensors_to_print = _get_distance_k_tensors(
-          self._parameters.trace_stack_size)
-      print_ops = [_print_tensor(t.name, -1, t, t) for t in tensors_to_print]
-      with ops.control_dependencies(print_ops):
-        return constant_op.constant(True)
-
-    if (self._parameters.trace_mode ==
-        tensor_tracer_flags.TRACE_MODE_FULL_IF_NAN):
-      return _show_full_tensors
    if (self._parameters.trace_mode ==
        tensor_tracer_flags.TRACE_MODE_PART_TENSOR):
      return _show_part_tensor
@ -1891,13 +1781,6 @@ class TensorTracer(object):
            else:
              return tensor_trace_fn(tensor)

-          def conditional_trace_fn(predicate_tensor, out_tensor, trace_fn,
-                                   out_tensor_name):
-            """Creates a cond op that traces the out_tensor if predicate is satisfied."""
-            return control_flow_ops.cond(
-                predicate_tensor, lambda: trace_fn(out_tensor, out_tensor_name),
-                lambda: constant_op.constant(False)).op
-
          if len(processed_tensors) != 1:
            raise RuntimeError('Multiple stats are only allowed in compact '
                               'mode.')
@ -1905,20 +1788,7 @@ class TensorTracer(object):
          # mode that uses compact format(self._use_tensor_values_cache = true).
          # Non-compact mode currently allows single stat per tensor.
          processed_out_tensor = six.next(six.itervalues(processed_tensors))
-
-          if self._parameters.is_conditional_trace:
-            trace_op = conditional_trace_fn(processed_out_tensor, out_tensor,
-                                            tpu_wrap_trace_fn, tensor_name)
-          elif self._parameters.included_cores:
-            should_print = constant_op.constant(False)
-            for core in self._parameters.included_cores:
-              should_print = gen_math_ops.logical_or(
-                  should_print, gen_math_ops.equal(self._replica_id, core))
-            trace_op = conditional_trace_fn(should_print, processed_out_tensor,
-                                            tpu_wrap_trace_fn, tensor_name)
-
-          else:
-            trace_op = tpu_wrap_trace_fn(processed_out_tensor, tensor_name)
+          trace_op = tpu_wrap_trace_fn(processed_out_tensor, tensor_name)

        if op_control_flow_context:
          # pylint: disable=protected-access
--- a/tensorflow/python/tpu/tensor_tracer_flags.py
+++ b/tensorflow/python/tpu/tensor_tracer_flags.py
@ -27,16 +27,17 @@ from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging

-TRACE_MODE_NAN_INF = 'nan-inf'
 TRACE_MODE_PART_TENSOR = 'part-tensor'
 TRACE_MODE_FULL_TENSOR = 'full-tensor'
-TRACE_MODE_FULL_IF_NAN = 'trace-back-if-nan'
+TRACE_MODE_FULL_TENSOR_SUMMARY = 'full_tensor_summary'
+
+TRACE_MODE_NAN_INF = 'nan-inf'
 TRACE_MODE_NORM = 'norm'
 TRACE_MODE_MAX_ABS = 'max-abs'
 TRACE_MODE_SUMMARY = 'summary'
 # summary mode to collects a finite set of signatures for each traced tensor,
 # (such as norm, max, min, mean) and dumps it using tb summaries.
-TRACE_MODE_FULL_TENSOR_SUMMARY = 'full_tensor_summary'
+
 # Full tensor mode dumps the whole tensor values for the traced tensors without
 # any processing on them; using tb summaries.

@ -49,20 +50,14 @@ _FLAG_NO_QUOTE_PAT = re.compile(r'\s*--([^=]+)=(\S*)')
 _FLAG_NO_EQUAL_PAT = re.compile(r'\s*--([^=]+)\s*')

 FLAGS_ENV_VAR = 'TENSOR_TRACER_FLAGS'
-FLAG_NAME_TRACE_STACK_SIZE = 'trace_stack_size'
 FLAG_NAME_ENABLE = 'enable'
 FLAG_NAME_TRACE_MODE = 'trace_mode'
-FLAG_NAME_USE_COMPACT_TRACE = 'compact_trace'
 FLAG_NAME_TRACE_SCALAR_OPS = 'trace_scalar'
-FLAG_NAME_TRACE_BEFORE_OPS = 'trace_before_included_ops'
-FLAG_NAME_TRACE_AFTER_OPS = 'trace_after_included_ops'
 FLAG_NAME_SUBMODE = 'submode'
-FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS = 'include_less_interesting_ops'
 FLAG_NAME_EXCLUDED_OPNAMES = 'excluded_opnames'
 FLAG_NAME_EXCLUDED_OPTYPES = 'excluded_optypes'
 FLAG_NAME_INCLUDED_OPNAMES = 'included_opnames'
 FLAG_NAME_INCLUDED_OPTYPES = 'included_optypes'
-FLAG_NAME_INCLUDED_CORES = 'included_cores'
 FLAG_NAME_TRACE_LEVEL = 'trace_level'
 FLAG_NAME_TRACE_DIR = 'trace_dir'
 FLAG_NAME_REPORT_FILE = 'report_file'
@ -124,41 +119,21 @@ class TTParameters(object):
    self.included_optype_re_list = self._flag_value_to_re_list(
        FLAG_NAME_INCLUDED_OPTYPES)

-    self.is_conditional_trace = self._is_conditional_trace_mode()
    self.trace_scalar_ops = self.is_flag_on(FLAG_NAME_TRACE_SCALAR_OPS)
-    self.use_compact_trace = self.is_flag_on(FLAG_NAME_USE_COMPACT_TRACE)
+    self.use_compact_trace = self.trace_mode in (TRACE_MODE_NAN_INF,
+                                                 TRACE_MODE_NORM,
+                                                 TRACE_MODE_MAX_ABS,
+                                                 TRACE_MODE_SUMMARY)
    self.use_temp_cache_var = self.is_flag_on(FLAG_NAME_TEMP_CACHE_VAR)
    self.use_fingerprint_subdir = self.is_flag_on(FLAG_NAME_FINGERPRINT_DIR)

-    # _trace_ops_before_included and _trace_ops_after_included denotes to depth
-    # of tracing relative to the ops given in --included_opnames or
-    # --included_optypes
-    # For example, in the below graph
-    #                op1 --> op2 --> op3 --> op4 --> op5
-    # If --included_opnames=op3 then only op3 will be traced.
-    # If also --trace_before_included_ops=2 (_trace_ops_before_included), then
-    # op1 and op2 will be traced as they are at most 2 hops apart from an
-    # included op. Similarly, if --trace_after_included_ops=2, then op4 and op5
-    # will also be traced.
-    self.trace_ops_before_included = self._get_flag_int_value(
-        FLAG_NAME_TRACE_BEFORE_OPS, 0)
-    self.trace_ops_after_included = self._get_flag_int_value(
-        FLAG_NAME_TRACE_AFTER_OPS, 0)
-    self.trace_stack_size = self._get_flag_int_value(FLAG_NAME_TRACE_STACK_SIZE,
-                                                     1)
    _, self.graph_dump_path = self.get_flag_value(
        FLAG_NAME_DUMP_BEFORE_AFTER_GRAPHS)
-    self.included_cores = self._flag_value_as_int_list(FLAG_NAME_INCLUDED_CORES)
-    self.include_less_interesting_ops = self.is_flag_on(
-        FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS)
    self.trace_level = self._get_flag_int_value(FLAG_NAME_TRACE_LEVEL,
                                                _TT_DEFAULT_TRACE_LEVEL)
    self.summary_signatures = self._get_summary_signatures()
    self.collect_summary_per_core = self.is_flag_on(FLAG_NAME_SUMMARY_PER_CORE)

-  def _is_conditional_trace_mode(self):
-    return self.trace_mode == TRACE_MODE_FULL_IF_NAN
-
  def _get_report_filepath(self):
    """Sets the path of the output report file."""

@ -205,7 +180,7 @@ class TTParameters(object):
      trace_mode = TRACE_MODE_NORM
    valid_trace_modes = [
        TRACE_MODE_NAN_INF, TRACE_MODE_PART_TENSOR, TRACE_MODE_FULL_TENSOR,
-        TRACE_MODE_NORM, TRACE_MODE_MAX_ABS, TRACE_MODE_FULL_IF_NAN,
+        TRACE_MODE_NORM, TRACE_MODE_MAX_ABS,
        TRACE_MODE_SUMMARY, TRACE_MODE_FULL_TENSOR_SUMMARY
    ]
    if trace_mode not in valid_trace_modes:
@ -265,15 +240,14 @@ class TTParameters(object):
  def _validate_flag_names(self):
    """Validates if the TensorTrace flags passed are valid."""
    valid_flag_names = [
-        FLAG_NAME_ENABLE, FLAG_NAME_TRACE_MODE, FLAG_NAME_USE_COMPACT_TRACE,
-        FLAG_NAME_TRACE_SCALAR_OPS, FLAG_NAME_TRACE_BEFORE_OPS,
-        FLAG_NAME_TRACE_AFTER_OPS, FLAG_NAME_TRACE_STACK_SIZE,
+        FLAG_NAME_ENABLE, FLAG_NAME_TRACE_MODE,
+        FLAG_NAME_TRACE_SCALAR_OPS,
        FLAG_NAME_SUBMODE, FLAG_NAME_EXCLUDED_OPNAMES,
        FLAG_NAME_EXCLUDED_OPTYPES, FLAG_NAME_INCLUDED_OPNAMES,
        FLAG_NAME_INCLUDED_OPTYPES, FLAG_NAME_TRACE_DIR,
-        FLAG_NAME_INCLUDED_CORES, FLAG_NAME_REPORT_FILE,
+        FLAG_NAME_REPORT_FILE,
        FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR,
-        FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS, FLAG_NAME_OP_RANGE,
+        FLAG_NAME_OP_RANGE,
        FLAG_NAME_DUMP_BEFORE_AFTER_GRAPHS, FLAG_NAME_TRACE_LEVEL,
        FLAG_NAME_SUMMARY_SIGNATURES, FLAG_NAME_SUMMARY_PER_CORE,
        FLAG_NAME_TEMP_CACHE_VAR, FLAG_NAME_FINGERPRINT_DIR
--- a/tensorflow/python/tpu/tensor_tracer_report.py
+++ b/tensorflow/python/tpu/tensor_tracer_report.py
@ -266,8 +266,6 @@ class TTReportHandle(object):
    report.config.num_cores = tt_config.num_replicas
    report.config.num_hosts = tt_config.num_hosts
    report.config.num_cores_per_host = tt_config.num_replicas_per_host
-    for core in tt_parameters.included_cores:
-      report.config.included_cores.append(core)
    report.config.submode = tt_parameters.submode
    report.config.trace_mode = tt_parameters.trace_mode

@ -351,12 +349,8 @@ class TTReportHandle(object):
                                  tt_parameters.trace_mode))
    self._write_report('%s %s\n'%(_FIELD_NAME_SUBMODE,
                                  tt_parameters.submode))
-    if tt_parameters.included_cores:
-      self._write_report('%s %s\n'%(_FIELD_NAME_NUM_REPLICAS,
-                                    len(tt_parameters.included_cores)))
-    else:
-      self._write_report('%s %s\n'%(_FIELD_NAME_NUM_REPLICAS,
-                                    tt_config.num_replicas))
+    self._write_report('%s %s\n'%(_FIELD_NAME_NUM_REPLICAS,
+                                  tt_config.num_replicas))
    self._write_report('%s %s\n'%(_FIELD_NAME_NUM_REPLICAS_PER_HOST,
                                  tt_config.num_replicas_per_host))
    self._write_report('%s %s\n'%(_FIELD_NAME_NUM_HOSTS, tt_config.num_hosts))