Tensor tracer file updates.
PiperOrigin-RevId: 335448917 Change-Id: Iec74fa0fc76d64a696b40ea23117cfb064427c28
This commit is contained in:
parent
0c14b17c1f
commit
1ba52ab95e
tensorflow/python
@ -20,3 +20,11 @@ from __future__ import print_function
|
||||
|
||||
def get_default_communication_protocol():
|
||||
return 'grpc'
|
||||
|
||||
|
||||
def is_remote_path(_):
|
||||
return False
|
||||
|
||||
|
||||
def get_appendable_file_encoding():
|
||||
return ''
|
||||
|
@ -226,6 +226,7 @@ pytype_library(
|
||||
"//tensorflow/python:dtypes",
|
||||
"//tensorflow/python:framework",
|
||||
"//tensorflow/python:framework_ops",
|
||||
"//tensorflow/python:platform",
|
||||
"//tensorflow/python:platform_analytics",
|
||||
"//tensorflow/python:tensor_shape",
|
||||
"//tensorflow/python:tpu_ops_gen",
|
||||
|
@ -49,6 +49,7 @@ from tensorflow.python.ops import summary_ops_v2 as summary
|
||||
from tensorflow.python.ops import variable_scope
|
||||
from tensorflow.python.platform import analytics
|
||||
from tensorflow.python.platform import gfile
|
||||
from tensorflow.python.platform import remote_utils
|
||||
from tensorflow.python.platform import tf_logging as logging
|
||||
from tensorflow.python.summary import summary_iterator
|
||||
from tensorflow.python.tpu import tensor_tracer_flags
|
||||
@ -344,24 +345,6 @@ def keras_layer_tracepoint(layer, checkpoint_name):
|
||||
return layer
|
||||
|
||||
|
||||
def _trace_files_need_precreated(output_dir):
|
||||
"""Return True if trace files must be pre-created by users."""
|
||||
|
||||
if not output_dir.startswith('/'):
|
||||
return False
|
||||
if len(output_dir) < 5:
|
||||
return False
|
||||
if output_dir[2] != 'n':
|
||||
return False
|
||||
if output_dir[3] != 's':
|
||||
return False
|
||||
if output_dir[1] != 'c':
|
||||
return False
|
||||
if output_dir[4] != '/':
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
class TensorTracer(object):
|
||||
"""A software construct for tracing tensor values in a TF graph.
|
||||
|
||||
@ -927,7 +910,9 @@ class TensorTracer(object):
|
||||
msg = '"%s"' % tensor_name
|
||||
|
||||
if self._parameters.trace_dir:
|
||||
output_path = os.path.join(self._parameters.trace_dir, _TRACE_FILE_NAME)
|
||||
output_path = os.path.join(
|
||||
self._parameters.trace_dir,
|
||||
_TRACE_FILE_NAME + self._get_outfile_suffix())
|
||||
output_stream = _OUTPUT_STREAM_ESCAPE + output_path
|
||||
else:
|
||||
output_stream = sys.stderr
|
||||
@ -1229,20 +1214,10 @@ class TensorTracer(object):
|
||||
# Output files are handled by tf.summary operations, no need to precreate
|
||||
# them.
|
||||
return
|
||||
if _trace_files_need_precreated(self._parameters.trace_dir):
|
||||
for replica_id in range(0, self._tt_config.num_replicas):
|
||||
trace_file_path = os.path.join(
|
||||
self._parameters.trace_dir,
|
||||
_COMPACT_TRACE_FILE_PREFIX) + '%d'%replica_id
|
||||
if not gfile.Exists(trace_file_path):
|
||||
raise RuntimeError(
|
||||
'%s must be pre-created with the '
|
||||
'appropriate properties.'%trace_file_path)
|
||||
else:
|
||||
if not gfile.Exists(self._parameters.trace_dir):
|
||||
file_io.recursive_create_dir(self._parameters.trace_dir)
|
||||
if not gfile.Exists(self._parameters.trace_dir):
|
||||
file_io.recursive_create_dir(self._parameters.trace_dir)
|
||||
if not gfile.Exists(self._parameters.trace_dir):
|
||||
raise RuntimeError('Failed to create %s'%self._parameters.trace_dir)
|
||||
raise RuntimeError('Failed to create %s'%self._parameters.trace_dir)
|
||||
|
||||
def _create_temp_cache(self, num_traced_tensors, num_signatures):
|
||||
"""Creates a temporary cache with the given dimensions.
|
||||
@ -1360,57 +1335,32 @@ class TensorTracer(object):
|
||||
|
||||
# No need to print core numbers if the cache is merged already.
|
||||
if self._parameters.collect_summary_per_core:
|
||||
print_op = logging_ops.print_v2(
|
||||
'\n',
|
||||
'core:',
|
||||
replica_id,
|
||||
',',
|
||||
'step:',
|
||||
step_num,
|
||||
'-->',
|
||||
step_error_message,
|
||||
'Printing tensors for mode:%s...' % self._parameters.trace_mode,
|
||||
summarize=-1,
|
||||
output_stream=output_stream)
|
||||
stats = ['\n\n', 'core:', replica_id, ',', 'step:', step_num, '-->',
|
||||
step_error_message,
|
||||
'Printing tensors for mode:%s...' % self._parameters.trace_mode]
|
||||
else:
|
||||
print_op = logging_ops.print_v2(
|
||||
'\n',
|
||||
'step:',
|
||||
step_num,
|
||||
'-->',
|
||||
step_error_message,
|
||||
'Printing tensors for mode:%s...' % self._parameters.trace_mode,
|
||||
summarize=-1,
|
||||
output_stream=output_stream)
|
||||
stats = ['\n\n', 'step:', step_num, '-->', step_error_message,
|
||||
'Printing tensors for mode:%s...' % self._parameters.trace_mode]
|
||||
|
||||
for tensor_name, cache_idx in sorted(
|
||||
tensor_trace_order.tensorname_to_cache_idx.items(),
|
||||
key=lambda item: item[1]):
|
||||
with ops.control_dependencies([print_op]):
|
||||
if self._parameters.collect_summary_per_core:
|
||||
print_op = logging_ops.print_v2(
|
||||
'\n',
|
||||
'core:',
|
||||
replica_id,
|
||||
',',
|
||||
'step:',
|
||||
step_num,
|
||||
',',
|
||||
tensor_name,
|
||||
'-->',
|
||||
_inspect_tensor(cache[cache_idx, 0]),
|
||||
summarize=-1, output_stream=output_stream)
|
||||
else:
|
||||
print_op = logging_ops.print_v2(
|
||||
'\n',
|
||||
'step:',
|
||||
step_num,
|
||||
',',
|
||||
tensor_name,
|
||||
'-->',
|
||||
_inspect_tensor(cache[cache_idx, 0]),
|
||||
summarize=-1, output_stream=output_stream)
|
||||
return print_op
|
||||
if self._parameters.collect_summary_per_core:
|
||||
stats.extend([
|
||||
'\n', 'core:', replica_id, ',', 'step:', step_num, ',',
|
||||
tensor_name, '-->', _inspect_tensor(cache[cache_idx, 0])])
|
||||
else:
|
||||
stats.extend([
|
||||
'\n', 'step:', step_num, ',',
|
||||
tensor_name, '-->', _inspect_tensor(cache[cache_idx, 0])])
|
||||
return logging_ops.print_v2(*stats, summarize=-1,
|
||||
output_stream=output_stream)
|
||||
|
||||
def _get_outfile_suffix(self):
|
||||
if remote_utils.is_remote_path(self._parameters.trace_dir):
|
||||
return remote_utils.get_appendable_file_encoding()
|
||||
else:
|
||||
return ''
|
||||
|
||||
def _generate_flush_cache_op(self, num_replicas, on_tpu, tensor_trace_order):
|
||||
"""Generates an Op that will flush the cache to file.
|
||||
@ -1435,7 +1385,7 @@ class TensorTracer(object):
|
||||
if self._parameters.trace_dir:
|
||||
output_path = (os.path.join(self._parameters.trace_dir,
|
||||
_COMPACT_TRACE_FILE_PREFIX)
|
||||
+ replica_str)
|
||||
+ replica_str + self._get_outfile_suffix())
|
||||
output_stream = _OUTPUT_STREAM_ESCAPE + output_path
|
||||
else:
|
||||
output_stream = sys.stderr
|
||||
|
Loading…
Reference in New Issue
Block a user