diff --git a/tensorflow/python/tpu/tensor_tracer.proto b/tensorflow/python/tpu/tensor_tracer.proto
index ad5392d65fe..7b745f0f45b 100644
--- a/tensorflow/python/tpu/tensor_tracer.proto
+++ b/tensorflow/python/tpu/tensor_tracer.proto
@@ -21,6 +21,10 @@ message TensorTracerReport {
   // A map from tensor name to its TracedTensorDef.
   map<string, TracedTensorDef> tensordef = 3;
 
+  // The fingerprint of the TensorTracerReport (fingerprint calculation excludes
+  // this field and graphdef).
+  string fingerprint = 4;
+
   message TensorTracerConfig {
     // Tensor tracer version, e.g. hostcall, outside compilation.
     string version = 1;
diff --git a/tensorflow/python/tpu/tensor_tracer.py b/tensorflow/python/tpu/tensor_tracer.py
index bd96de42f3a..b4f99897094 100644
--- a/tensorflow/python/tpu/tensor_tracer.py
+++ b/tensorflow/python/tpu/tensor_tracer.py
@@ -100,7 +100,7 @@
 _TT_TENSORBOARD_PLUGIN_NAME = 'tensor_tracer'
 _TT_HOSTCALL_KEY = 'tensor_tracer_host_call'
 _TT_EVENT_FILE_SUFFIX = '.tensor_tracer'
-_TT_SUMMARY_MAX_QUEUE = 100
+_TT_SUMMARY_MAX_QUEUE = 10
 
 
 def set_parameters(tensor_tracer_params=None):
@@ -206,6 +206,9 @@
         -> op2 -> op1 -> op0, if op0 has a NaN and trace_stack_size is 1, the
         result of op1 will also be printed. trace_stack_size is 2, the result
         of op1 and op2 will be printed.
+      - use_fingerprint_subdirectory: The trace directory will be chosen
+        using the fingerprint of the trace metadata under the provided
+        trace_dir.
   """
   flags = '--%s=1' % tensor_tracer_flags.FLAG_NAME_ENABLE
   if tensor_tracer_params:
@@ -547,6 +550,7 @@ class TensorTracer(object):
     self._traced_op_names = set()
     self._report_proto = None
     self._temp_cache_var = []
+    self._report_proto_path = ''
 
   def report_proto(self):
     """Getter for tensor_tracer.proto object for summary and full_tensor_summary modes.
@@ -564,6 +568,14 @@ class TensorTracer(object):
           'Report proto only exists for '
           'trace_mode=[summary|full_tensor_summary]')
 
+  def report_proto_path(self):
+    """Getter for path where tensor_tracer.proto object should be written.
+
+    Returns:
+      A string path.
+ """ + return self._report_proto_path + def _get_all_cache_variables(self): return self._cache_variables @@ -1366,6 +1378,13 @@ class TensorTracer(object): self._report_proto = report_handler.create_report_proto( self._tt_config, self._parameters, tensor_trace_order, tensor_trace_points, self._signature_types()) + if self._parameters.use_fingerprint_subdir: + self._parameters.trace_dir = os.path.join( + self._parameters.trace_dir, self._report_proto.fingerprint) + logging.info('TensorTracer updating trace_dir to %s', + self._parameters.trace_dir) + self._report_proto_path = tensor_tracer_report.report_proto_path( + self._parameters.trace_dir) if self._parameters.report_file_path != _SKIP_REPORT_FILE: report_handler.write_report_proto(self._report_proto, self._parameters) else: diff --git a/tensorflow/python/tpu/tensor_tracer_flags.py b/tensorflow/python/tpu/tensor_tracer_flags.py index c5e3e88597b..4e412c46e82 100644 --- a/tensorflow/python/tpu/tensor_tracer_flags.py +++ b/tensorflow/python/tpu/tensor_tracer_flags.py @@ -74,6 +74,7 @@ FLAG_NAME_DUMP_BEFORE_AFTER_GRAPHS = 'dump_graphs' FLAG_NAME_SUMMARY_SIGNATURES = 'signatures' FLAG_NAME_SUMMARY_PER_CORE = 'collect_summary_per_core' FLAG_NAME_TEMP_CACHE_VAR = 'use_temp_cache' +FLAG_NAME_FINGERPRINT_DIR = 'use_fingerprint_subdirectory' _OP_RANGE_PAT = re.compile(r'(\d+):(\d+)') _TEST_UNDECLARED_OUTPUTS_DIR_ENV_VAR = 'TEST_UNDECLARED_OUTPUTS_DIR' @@ -127,6 +128,7 @@ class TTParameters(object): self.trace_scalar_ops = self.is_flag_on(FLAG_NAME_TRACE_SCALAR_OPS) self.use_compact_trace = self.is_flag_on(FLAG_NAME_USE_COMPACT_TRACE) self.use_temp_cache_var = self.is_flag_on(FLAG_NAME_TEMP_CACHE_VAR) + self.use_fingerprint_subdir = self.is_flag_on(FLAG_NAME_FINGERPRINT_DIR) # _trace_ops_before_included and _trace_ops_after_included denotes to depth # of tracing relative to the ops given in --included_opnames or @@ -274,7 +276,7 @@ class TTParameters(object): FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS, FLAG_NAME_OP_RANGE, FLAG_NAME_DUMP_BEFORE_AFTER_GRAPHS, FLAG_NAME_TRACE_LEVEL, FLAG_NAME_SUMMARY_SIGNATURES, FLAG_NAME_SUMMARY_PER_CORE, - FLAG_NAME_TEMP_CACHE_VAR + FLAG_NAME_TEMP_CACHE_VAR, FLAG_NAME_FINGERPRINT_DIR ] tensor_tracer_flags = self._env.get(FLAGS_ENV_VAR) if not tensor_tracer_flags: diff --git a/tensorflow/python/tpu/tensor_tracer_report.py b/tensorflow/python/tpu/tensor_tracer_report.py index e8a122d981f..3270b2a2fd3 100644 --- a/tensorflow/python/tpu/tensor_tracer_report.py +++ b/tensorflow/python/tpu/tensor_tracer_report.py @@ -19,8 +19,10 @@ from __future__ import division from __future__ import print_function import collections +import hashlib import os + from tensorflow.python.platform import gfile from tensorflow.python.platform import tf_logging as logging from tensorflow.python.tpu import tensor_tracer_pb2 @@ -53,6 +55,18 @@ _CURRENT_VERSION = 'use-outside-compilation' _TT_REPORT_PROTO = 'tensor_tracer_report.report_pb' +def report_proto_path(trace_dir): + """Returns the path where report proto should be written. + + Args: + trace_dir: String denoting the trace directory. + + Returns: + A string denoting the path to the report proto. + """ + return os.path.join(trace_dir, _TT_REPORT_PROTO) + + def topological_sort(g): """Performs topological sort on the given graph. 
@@ -206,6 +220,12 @@ class OpenReportFile(object):
     self._report_file.close()
 
 
+def proto_fingerprint(message_proto):
+  serialized_message = message_proto.SerializeToString()
+  hasher = hashlib.sha256(serialized_message)
+  return hasher.hexdigest()
+
+
 class TTReportHandle(object):
   """Utility class responsible from creating a tensor tracer report."""
 
@@ -255,8 +275,6 @@
                                        key=lambda x: x[1]):
       report.config.signatures.append(signature_name)
 
-    tf_graph = tensor_trace_order.graph_order.graph
-    report.graphdef.CopyFrom(tf_graph.as_graph_def())
     for tensor in tensor_trace_order.graph_order.tensors:
       tensor_def = tensor_tracer_pb2.TensorTracerReport.TracedTensorDef()
       tensor_def.name = tensor.name
@@ -265,6 +283,11 @@
       if tensor.name in tensor_trace_order.tensorname_to_cache_idx:
         tensor_def.cache_index = (
             tensor_trace_order.tensorname_to_cache_idx[tensor.name])
       else:
+        # To prevent small changes from affecting the fingerprint calculation,
+        # avoid writing the untraced tensors to metadata. Fingerprints will
+        # differ only when the list of traced tensors differs.
+        if tt_parameters.use_fingerprint_subdir:
+          continue
         tensor_def.is_traced = False
       if tensor.name in tensor_trace_points:
@@ -274,12 +297,17 @@
       elif tensor.op.name in self.instrument_records:
         tensor_def.explanation = self.instrument_records[tensor.op.name]
       report.tensordef[tensor.name].CopyFrom(tensor_def)
+    report.fingerprint = proto_fingerprint(report)
+    logging.info('TensorTracerProto fingerprint is %s.',
+                 report.fingerprint)
+    tf_graph = tensor_trace_order.graph_order.graph
+    report.graphdef.CopyFrom(tf_graph.as_graph_def())
     return report
 
   def write_report_proto(self, report_proto, tt_parameters):
     """Writes the given report proto under trace_dir."""
     gfile.MakeDirs(tt_parameters.trace_dir)
-    report_path = os.path.join(tt_parameters.trace_dir, _TT_REPORT_PROTO)
+    report_path = report_proto_path(tt_parameters.trace_dir)
     with gfile.GFile(report_path, 'wb') as f:
       f.write(report_proto.SerializeToString())
 
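Note (reviewer sketch, not part of the patch): the snippet below illustrates how the pieces above compose, assuming only what the diff shows -- the TensorTracerReport proto with its tensordef and fingerprint fields, the SHA-256-over-serialized-proto scheme used by proto_fingerprint, and the trace_dir update performed when use_fingerprint_subdirectory is on. The tensor name and base directory are made-up placeholders.

import hashlib
import os

from tensorflow.python.tpu import tensor_tracer_pb2


def fingerprint_of(report):
  # Same scheme as proto_fingerprint() in the diff: SHA-256 hex digest of the
  # serialized report. It is computed before graphdef and the fingerprint
  # field are populated, so neither influences the result.
  return hashlib.sha256(report.SerializeToString()).hexdigest()


report = tensor_tracer_pb2.TensorTracerReport()
traced = tensor_tracer_pb2.TensorTracerReport.TracedTensorDef()
traced.name = 'dense/MatMul:0'  # hypothetical traced tensor
traced.is_traced = True
traced.cache_index = 0
report.tensordef[traced.name].CopyFrom(traced)
report.fingerprint = fingerprint_of(report)

# With use_fingerprint_subdirectory enabled, TensorTracer joins the fingerprint
# onto the user-provided trace directory ('/tmp/tensor_tracer' is a placeholder).
trace_dir = os.path.join('/tmp/tensor_tracer', report.fingerprint)
print(trace_dir)

Because untraced tensors are skipped from the report when the flag is on, two runs land in the same subdirectory exactly when the traced-tensor metadata they fingerprint is identical.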