STT-tensorflow/tensorflow/python/debug/lib/source_utils.py
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Classes and functions that help to inspect Python source w.r.t. TF graphs."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import os
import re
import zipfile
import absl
import numpy as np
from tensorflow.python.debug.lib import profiling
_TENSORFLOW_BASEDIR = os.path.dirname(
os.path.dirname(os.path.dirname(os.path.dirname(
os.path.normpath(os.path.abspath(__file__))))))
_ABSL_BASEDIR = os.path.dirname(absl.__file__)
# Note: the trailing comma makes this a 1-tuple. A bare (".py") would be a
# plain str, and the `in` checks below would then do substring matching
# (e.g., an empty extension would wrongly match).
UNCOMPILED_SOURCE_SUFFIXES = (".py",)
COMPILED_SOURCE_SUFFIXES = (".pyc", ".pyo")

def _norm_abs_path(file_path):
  return os.path.normpath(os.path.abspath(file_path))


def is_extension_uncompiled_python_source(file_path):
  _, extension = os.path.splitext(file_path)
  return extension.lower() in UNCOMPILED_SOURCE_SUFFIXES


def is_extension_compiled_python_source(file_path):
  _, extension = os.path.splitext(file_path)
  return extension.lower() in COMPILED_SOURCE_SUFFIXES
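
# A quick illustration of the two extension checks above (the paths are made
# up for illustration; only the extension is examined, so the files need not
# exist):
#
#   is_extension_uncompiled_python_source("/tmp/model.py")   # -> True
#   is_extension_compiled_python_source("/tmp/model.pyc")    # -> True
#   is_extension_uncompiled_python_source("/tmp/model.cc")   # -> False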

def _convert_watch_key_to_tensor_name(watch_key):
  return watch_key[:watch_key.rfind(":")]
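
# A debug watch key has the form "<node_name>:<output_slot>:<debug_op>" (see
# the `watch_key.split(":")` in `list_source_files_against_dump` below).
# Dropping the trailing debug-op component yields the tensor name, e.g.
# (hypothetical node name):
#
#   _convert_watch_key_to_tensor_name("dense/MatMul:0:DebugIdentity")
#   # -> "dense/MatMul:0"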

def guess_is_tensorflow_py_library(py_file_path):
  """Guess whether a Python source file is a part of the tensorflow library.

  Special cases:
    1) Returns False for unit-test files in the library (*_test.py),
    2) Returns False for files under python/debug/examples.

  Args:
    py_file_path: full path of the Python source file in question.

  Returns:
    (`bool`) Whether the file is inferred to be a part of the tensorflow
      library.
  """
  if (not is_extension_uncompiled_python_source(py_file_path) and
      not is_extension_compiled_python_source(py_file_path)):
    return False
  py_file_path = _norm_abs_path(py_file_path)
  return ((py_file_path.startswith(_TENSORFLOW_BASEDIR) or
           py_file_path.startswith(_ABSL_BASEDIR)) and
          not py_file_path.endswith("_test.py") and
          (os.path.normpath("tensorflow/python/debug/examples") not in
           os.path.normpath(py_file_path)))
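
# For instance, since this module lives inside the TensorFlow source tree:
#
#   guess_is_tensorflow_py_library(__file__)  # -> True
#
# whereas a unit-test file under the same tree (hypothetical path) hits the
# *_test.py special case:
#
#   guess_is_tensorflow_py_library(
#       os.path.join(_TENSORFLOW_BASEDIR,
#                    "tensorflow/python/debug/lib/source_utils_test.py"))
#   # -> False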

def load_source(source_file_path):
  """Load the content of a Python source code file.

  This function covers the following cases:
    1. source_file_path points to an existing Python (.py) file on the
       file system.
    2. source_file_path is a path within a .par file (i.e., a zip-compressed,
       self-contained Python executable).

  Args:
    source_file_path: Path to the Python source file to read.

  Returns:
    A length-2 tuple:
      - Lines of the source file, as a `list` of `str`s.
      - The width of the string needed to show the line number in the file.
        This is calculated based on the number of lines in the source file.

  Raises:
    IOError: if loading is unsuccessful.
  """
  if os.path.isfile(source_file_path):
    with open(source_file_path, "rb") as f:
      source_text = f.read().decode("utf-8")
    source_lines = source_text.split("\n")
  else:
    # One possible reason why the file doesn't exist is that it's a path
    # inside a .par file. Try that possibility.
    source_lines = _try_load_par_source(source_file_path)
    if source_lines is None:
      raise IOError(
          "Source path neither exists nor can be loaded as a .par file: %s" %
          source_file_path)
  line_num_width = int(np.ceil(np.log10(len(source_lines)))) + 3
  return source_lines, line_num_width
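
# The returned width leaves room for the widest line number plus padding.
# E.g., for a (hypothetical) file that splits into 100 lines:
#
#   source_lines, width = load_source("/path/to/a/hundred_line_file.py")
#   # len(source_lines) == 100
#   # width == int(np.ceil(np.log10(100))) + 3 == 5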

def _try_load_par_source(source_file_path):
  """Try loading the source code inside a .par file.

  A .par file is a zip-compressed, self-contained Python executable.
  It contains the content of individual Python source files that can
  be read only through extracting from the zip file.

  Args:
    source_file_path: The full path to the file inside the .par file. This
      path should include the path to the .par file itself, followed by the
      intra-par path, e.g.,
      "/tmp/my_executable.par/org-tensorflow/tensorflow/python/foo/bar.py".

  Returns:
    If successful, lines of the source file as a `list` of `str`s.
    Else, `None`.
  """
  prefix_path = source_file_path
  while True:
    prefix_path, basename = os.path.split(prefix_path)
    if not basename:
      break
    suffix_path = os.path.normpath(
        os.path.relpath(source_file_path, start=prefix_path))
    if prefix_path.endswith(".par") and os.path.isfile(prefix_path):
      with zipfile.ZipFile(prefix_path) as z:
        norm_names = [os.path.normpath(name) for name in z.namelist()]
        if suffix_path in norm_names:
          with z.open(z.namelist()[norm_names.index(suffix_path)]) as zf:
            source_text = zf.read().decode("utf-8")
            return source_text.split("\n")
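
# The loop above peels off one trailing path component at a time. For the
# docstring's example path, it eventually reaches:
#
#   prefix_path == "/tmp/my_executable.par"  # an existing zip file
#   suffix_path == "org-tensorflow/tensorflow/python/foo/bar.py"
#
# at which point suffix_path is looked up among the normalized member names
# of the zip archive. If no .par prefix is ever found, the function falls
# through and implicitly returns None.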

def annotate_source(dump,
                    source_file_path,
                    do_dumped_tensors=False,
                    file_stack_top=False,
                    min_line=None,
                    max_line=None):
  """Annotate a Python source file with a list of ops created at each line.

  (The annotation doesn't change the source file itself.)

  Args:
    dump: (`DebugDumpDir`) A `DebugDumpDir` object of which the Python graph
      has been loaded.
    source_file_path: (`str`) Path to the source file being annotated.
    do_dumped_tensors: (`bool`) Whether dumped Tensors, instead of ops, are
      to be used to annotate the source file.
    file_stack_top: (`bool`) Whether only the top stack trace in the
      specified source file is to be annotated.
    min_line: (`None` or `int`) The 1-based line number to start annotating
      the source file from (inclusive).
    max_line: (`None` or `int`) The 1-based line number to end the annotation
      at (exclusive).

  Returns:
    A `dict` mapping 1-based line number to a list of op name(s) created at
      that line, or tensor names if `do_dumped_tensors` is True.

  Raises:
    ValueError: If the dump object does not have a Python graph set.
  """
  py_graph = dump.python_graph
  if not py_graph:
    raise ValueError("Cannot perform source annotation due to a lack of set "
                     "Python graph in the dump object")

  source_file_path = _norm_abs_path(source_file_path)

  line_to_op_names = {}
  for op in py_graph.get_operations():
    for file_path, line_number, _, _ in reversed(dump.node_traceback(op.name)):
      if (min_line is not None and line_number < min_line or
          max_line is not None and line_number >= max_line):
        continue
      if _norm_abs_path(file_path) != source_file_path:
        continue
      if do_dumped_tensors:
        watch_keys = dump.debug_watch_keys(op.name)
        # Convert watch keys to unique Tensor names.
        items_to_append = list(
            set(map(_convert_watch_key_to_tensor_name, watch_keys)))
      else:
        items_to_append = [op.name]
      if line_number in line_to_op_names:
        line_to_op_names[line_number].extend(items_to_append)
      else:
        line_to_op_names[line_number] = items_to_append
      if file_stack_top:
        break
  return line_to_op_names
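
# A minimal usage sketch (assuming `dump` is a hypothetical
# `debug_data.DebugDumpDir` whose Python graph has been set, and a made-up
# source path):
#
#   line_to_ops = annotate_source(dump, "/path/to/model_construction.py")
#   for line_number in sorted(line_to_ops):
#     print("%d: %s" % (line_number, ", ".join(line_to_ops[line_number])))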

def list_source_files_against_dump(dump,
                                   path_regex_allowlist=None,
                                   node_name_regex_allowlist=None):
  """Generate a list of source files with information regarding ops and tensors.

  Args:
    dump: (`DebugDumpDir`) A `DebugDumpDir` object of which the Python graph
      has been loaded.
    path_regex_allowlist: A regular-expression filter for source file path.
    node_name_regex_allowlist: A regular-expression filter for node names.

  Returns:
    A list of tuples regarding the Python source files involved in constructing
    the ops and tensors contained in `dump`. Each tuple is:
      (source_file_path, is_tf_library, num_nodes, num_tensors, num_dumps,
       first_line)

      is_tf_library: (`bool`) A guess of whether the file belongs to the
        TensorFlow Python library.
      num_nodes: How many nodes were created by lines of this source file.
        These include nodes with dumps and those without.
      num_tensors: How many Tensors were created by lines of this source file.
        These include Tensors with dumps and those without.
      num_dumps: How many debug Tensor dumps were from nodes (and Tensors)
        that were created by this source file.
      first_line: The first line number (1-based) that created any nodes or
        Tensors in this source file.

    The list is sorted by ascending order of source_file_path.

  Raises:
    ValueError: If the dump object does not have a Python graph set.
  """
  py_graph = dump.python_graph
  if not py_graph:
    raise ValueError("Cannot generate source list due to a lack of set "
                     "Python graph in the dump object")

  path_to_node_names = collections.defaultdict(set)
  path_to_tensor_names = collections.defaultdict(set)
  path_to_first_line = {}
  tensor_name_to_num_dumps = {}

  path_regex = (
      re.compile(path_regex_allowlist) if path_regex_allowlist else None)
  node_name_regex = (
      re.compile(node_name_regex_allowlist)
      if node_name_regex_allowlist else None)

  to_skip_file_paths = set()
  for op in py_graph.get_operations():
    if node_name_regex and not node_name_regex.match(op.name):
      continue

    for file_path, line_number, _, _ in dump.node_traceback(op.name):
      file_path = _norm_abs_path(file_path)
      if (file_path in to_skip_file_paths or
          path_regex and not path_regex.match(file_path) or
          not os.path.isfile(file_path)):
        to_skip_file_paths.add(file_path)
        continue

      path_to_node_names[file_path].add(op.name)
      if file_path in path_to_first_line:
        if path_to_first_line[file_path] > line_number:
          path_to_first_line[file_path] = line_number
      else:
        path_to_first_line[file_path] = line_number

      for output_tensor in op.outputs:
        tensor_name = output_tensor.name
        path_to_tensor_names[file_path].add(tensor_name)

      watch_keys = dump.debug_watch_keys(op.name)
      for watch_key in watch_keys:
        node_name, output_slot, debug_op = watch_key.split(":")
        tensor_name = "%s:%s" % (node_name, output_slot)
        if tensor_name not in tensor_name_to_num_dumps:
          tensor_name_to_num_dumps[tensor_name] = len(
              dump.get_tensors(node_name, int(output_slot), debug_op))

  path_to_num_dumps = {}
  for path in path_to_tensor_names:
    path_to_num_dumps[path] = sum(
        tensor_name_to_num_dumps.get(tensor_name, 0)
        for tensor_name in path_to_tensor_names[path])

  output = []
  for file_path in path_to_node_names:
    output.append((
        file_path,
        guess_is_tensorflow_py_library(file_path),
        len(path_to_node_names.get(file_path, {})),
        len(path_to_tensor_names.get(file_path, {})),
        path_to_num_dumps.get(file_path, 0),
        path_to_first_line[file_path]))
  return sorted(output, key=lambda x: x[0])
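
# A usage sketch (again assuming a hypothetical `dump` with a Python graph
# set); the allowlist keeps only files under a made-up user directory:
#
#   for (file_path, is_tf_library, num_nodes, num_tensors, num_dumps,
#        first_line) in list_source_files_against_dump(
#            dump, path_regex_allowlist=r"/home/me/.*"):
#     if not is_tf_library:
#       print("%s: %d nodes, %d tensors, %d dumps (first line %d)" %
#             (file_path, num_nodes, num_tensors, num_dumps, first_line))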

def annotate_source_against_profile(profile_data,
                                    source_file_path,
                                    node_name_filter=None,
                                    op_type_filter=None,
                                    min_line=None,
                                    max_line=None):
  """Annotate a Python source file with profiling information at each line.

  (The annotation doesn't change the source file itself.)

  Args:
    profile_data: (`list` of `ProfileDatum`) A list of `ProfileDatum`.
    source_file_path: (`str`) Path to the source file being annotated.
    node_name_filter: Regular expression to filter by node name.
    op_type_filter: Regular expression to filter by op type.
    min_line: (`None` or `int`) The 1-based line number to start annotating
      the source file from (inclusive).
    max_line: (`None` or `int`) The 1-based line number to end the annotation
      at (exclusive).

  Returns:
    A `dict` mapping 1-based line number to the `profiling.AggregateProfile`
      that aggregates all `ProfileDatum`s attributed to that line.
  """
  source_file_path = _norm_abs_path(source_file_path)
  node_name_regex = re.compile(node_name_filter) if node_name_filter else None
  op_type_regex = re.compile(op_type_filter) if op_type_filter else None

  line_to_profile_summary = {}
  for profile_datum in profile_data:
    if not profile_datum.file_path:
      continue
    if _norm_abs_path(profile_datum.file_path) != source_file_path:
      continue
    if (min_line is not None and profile_datum.line_number < min_line or
        max_line is not None and profile_datum.line_number >= max_line):
      continue
    if (node_name_regex and
        not node_name_regex.match(profile_datum.node_exec_stats.node_name)):
      continue
    if op_type_regex and not op_type_regex.match(profile_datum.op_type):
      continue
    if profile_datum.line_number not in line_to_profile_summary:
      line_to_profile_summary[profile_datum.line_number] = (
          profiling.AggregateProfile(profile_datum))
    else:
      line_to_profile_summary[profile_datum.line_number].add(profile_datum)
  return line_to_profile_summary
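
# A usage sketch, assuming `profile_data` is a list of `ProfileDatum`s
# collected elsewhere by tfdbg's profiling support (the source path and the
# `total_op_time` property of `profiling.AggregateProfile` are assumptions
# here):
#
#   line_to_profile = annotate_source_against_profile(
#       profile_data, "/path/to/model_construction.py",
#       op_type_filter="MatMul")
#   for line_number in sorted(line_to_profile):
#     print("line %d: total op time = %s us" %
#           (line_number, line_to_profile[line_number].total_op_time))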