The helper function `guess_is_tensorflow_py_library()` throws an error for file paths that lack the expected Python source-file extensions. In particular, it errors out when the file path is an empty string, a case that users have reported. This CL makes the helper function return False for such malformed file paths instead of throwing an error.

PiperOrigin-RevId: 315276482
Change-Id: Ib19750d502cd55fcfbef6f95a2064adc2e0816e9
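A minimal sketch of the changed behavior (assuming the module path tensorflow.python.debug.lib.source_utils; the input paths are hypothetical):

    from tensorflow.python.debug.lib import source_utils

    source_utils.guess_is_tensorflow_py_library("")        # False, no error
    source_utils.guess_is_tensorflow_py_library("a.txt")   # False, no error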
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Classes and functions that help to inspect Python source w.r.t. TF graphs."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import os
import re
import zipfile

import absl
import numpy as np

from tensorflow.python.debug.lib import profiling


_TENSORFLOW_BASEDIR = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.dirname(
        os.path.normpath(os.path.abspath(__file__))))))

_ABSL_BASEDIR = os.path.dirname(absl.__file__)

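# Recognized Python source-file extensions. The trailing comma in
# UNCOMPILED_SOURCE_SUFFIXES makes the value a 1-tuple rather than a bare
# string, so the `in` tests below match whole suffixes instead of substrings
# (the empty extension "" is a substring of the bare string ".py").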
UNCOMPILED_SOURCE_SUFFIXES = (".py",)
COMPILED_SOURCE_SUFFIXES = (".pyc", ".pyo")


def _norm_abs_path(file_path):
  return os.path.normpath(os.path.abspath(file_path))


def is_extension_uncompiled_python_source(file_path):
  _, extension = os.path.splitext(file_path)
  return extension.lower() in UNCOMPILED_SOURCE_SUFFIXES


def is_extension_compiled_python_source(file_path):
  _, extension = os.path.splitext(file_path)
  return extension.lower() in COMPILED_SOURCE_SUFFIXES


def _convert_watch_key_to_tensor_name(watch_key):
  # A watch key has the form "<node_name>:<output_slot>:<debug_op>"; dropping
  # the trailing ":<debug_op>" yields the tensor name
  # "<node_name>:<output_slot>".
  return watch_key[:watch_key.rfind(":")]


def guess_is_tensorflow_py_library(py_file_path):
  """Guess whether a Python source file is a part of the tensorflow library.

  Special cases:
    1) Returns False for unit-test files in the library (*_test.py),
    2) Returns False for files under python/debug/examples.

  Args:
    py_file_path: full path of the Python source file in question.

  Returns:
    (`bool`) Whether the file is inferred to be a part of the tensorflow
      library.
  """
  if (not is_extension_uncompiled_python_source(py_file_path) and
      not is_extension_compiled_python_source(py_file_path)):
    return False
  py_file_path = _norm_abs_path(py_file_path)
  return ((py_file_path.startswith(_TENSORFLOW_BASEDIR) or
           py_file_path.startswith(_ABSL_BASEDIR)) and
          not py_file_path.endswith("_test.py") and
          (os.path.normpath("tensorflow/python/debug/examples") not in
           os.path.normpath(py_file_path)))
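
# Illustrative classifications (hypothetical paths; <tf> stands for the
# installed tensorflow package directory):
#   <tf>/python/framework/ops.py        -> True   (library file)
#   <tf>/python/framework/ops_test.py   -> False  (unit-test file)
#   <tf>/python/debug/examples/a.py     -> False  (debug examples)
#   /home/user/my_model.py              -> False  (outside the library)
#   ""                                  -> False  (malformed path, no error)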


def load_source(source_file_path):
  """Load the content of a Python source code file.

  This function covers the following cases:
    1. source_file_path points to an existing Python (.py) file on the
       file system.
    2. source_file_path is a path within a .par file (i.e., a zip-compressed,
       self-contained Python executable).

  Args:
    source_file_path: Path to the Python source file to read.

  Returns:
    A length-2 tuple:
      - Lines of the source file, as a `list` of `str`s.
      - The width of the string needed to show the line number in the file.
        This is calculated based on the number of lines in the source file.

  Raises:
    IOError: if loading is unsuccessful.
  """
  if os.path.isfile(source_file_path):
    with open(source_file_path, "rb") as f:
      source_text = f.read().decode("utf-8")
    source_lines = source_text.split("\n")
  else:
    # One possible reason why the file doesn't exist is that it's a path
    # inside a .par file. Try that possibility.
    source_lines = _try_load_par_source(source_file_path)
    if source_lines is None:
      raise IOError(
          "Source path neither exists nor can be loaded as a .par file: %s" %
          source_file_path)
  line_num_width = int(np.ceil(np.log10(len(source_lines)))) + 3
  return source_lines, line_num_width
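
# Illustrative width calculation: for a 250-line file, len(source_lines) is
# 250, so line_num_width = ceil(log10(250)) + 3 = 3 + 3 = 6, i.e., room for a
# 3-digit line number plus padding.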


def _try_load_par_source(source_file_path):
  """Try loading the source code inside a .par file.

  A .par file is a zip-compressed, self-contained Python executable.
  It contains the content of individual Python source files that can
  be read only through extracting from the zip file.

  Args:
    source_file_path: The full path to the file inside the .par file. This
      path should include the path to the .par file itself, followed by the
      intra-par path, e.g.,
      "/tmp/my_executable.par/org-tensorflow/tensorflow/python/foo/bar.py".

  Returns:
    If successful, lines of the source file as a `list` of `str`s.
    Else, `None`.
  """
  prefix_path = source_file_path
  while True:
    prefix_path, basename = os.path.split(prefix_path)
    if not basename:
      break
    suffix_path = os.path.normpath(
        os.path.relpath(source_file_path, start=prefix_path))
    if prefix_path.endswith(".par") and os.path.isfile(prefix_path):
      with zipfile.ZipFile(prefix_path) as z:
        norm_names = [os.path.normpath(name) for name in z.namelist()]
        if suffix_path in norm_names:
          with z.open(z.namelist()[norm_names.index(suffix_path)]) as zf:
            source_text = zf.read().decode("utf-8")
            return source_text.split("\n")
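
# Illustrative walk for the hypothetical path
# "/tmp/app.par/org-tensorflow/foo/bar.py": the loop strips one component per
# iteration, checking the prefixes ".../org-tensorflow/foo",
# ".../org-tensorflow", and finally "/tmp/app.par"; the last one ends with
# ".par" and exists as a file, so the intra-par suffix
# "org-tensorflow/foo/bar.py" is looked up in the zip archive's name list.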


def annotate_source(dump,
                    source_file_path,
                    do_dumped_tensors=False,
                    file_stack_top=False,
                    min_line=None,
                    max_line=None):
  """Annotate a Python source file with a list of ops created at each line.

  (The annotation doesn't change the source file itself.)

  Args:
    dump: (`DebugDumpDir`) A `DebugDumpDir` object of which the Python graph
      has been loaded.
    source_file_path: (`str`) Path to the source file being annotated.
    do_dumped_tensors: (`bool`) Whether dumped Tensors, instead of ops, are to
      be used to annotate the source file.
    file_stack_top: (`bool`) Whether only the top stack trace in the
      specified source file is to be annotated.
    min_line: (`None` or `int`) The 1-based line number to start annotating the
      source file from (inclusive).
    max_line: (`None` or `int`) The 1-based line number to end the annotation
      at (exclusive).

  Returns:
    A `dict` mapping 1-based line number to a list of op name(s) created at
      that line, or tensor names if `do_dumped_tensors` is True.

  Raises:
    ValueError: If the dump object does not have a Python graph set.
  """

  py_graph = dump.python_graph
  if not py_graph:
    raise ValueError("Cannot perform source annotation due to a lack of set "
                     "Python graph in the dump object")

  source_file_path = _norm_abs_path(source_file_path)

  line_to_op_names = {}
  for op in py_graph.get_operations():
    for file_path, line_number, _, _ in reversed(dump.node_traceback(op.name)):
      if (min_line is not None and line_number < min_line or
          max_line is not None and line_number >= max_line):
        continue

      if _norm_abs_path(file_path) != source_file_path:
        continue

      if do_dumped_tensors:
        watch_keys = dump.debug_watch_keys(op.name)
        # Convert watch keys to unique Tensor names.
        items_to_append = list(
            set(map(_convert_watch_key_to_tensor_name, watch_keys)))
      else:
        items_to_append = [op.name]

      if line_number in line_to_op_names:
        line_to_op_names[line_number].extend(items_to_append)
      else:
        line_to_op_names[line_number] = items_to_append

      if file_stack_top:
        break

  return line_to_op_names
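
# Minimal usage sketch (assuming `dump` is a `DebugDumpDir` whose
# `python_graph` has been set; the script path is hypothetical):
#
#   line_to_ops = annotate_source(dump, "/path/to/train_script.py")
#   # e.g., {42: ["dense/MatMul", "dense/BiasAdd"], 43: ["loss/Softmax"]}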


def list_source_files_against_dump(dump,
                                   path_regex_whitelist=None,
                                   node_name_regex_whitelist=None):
  """Generate a list of source files with information regarding ops and tensors.

  Args:
    dump: (`DebugDumpDir`) A `DebugDumpDir` object of which the Python graph
      has been loaded.
    path_regex_whitelist: A regular-expression filter for source file path.
    node_name_regex_whitelist: A regular-expression filter for node names.

  Returns:
    A list of tuples regarding the Python source files involved in constructing
    the ops and tensors contained in `dump`. Each tuple is:
      (source_file_path, is_tf_library, num_nodes, num_tensors, num_dumps,
       first_line)

      is_tf_library: (`bool`) A guess of whether the file belongs to the
        TensorFlow Python library.
      num_nodes: How many nodes were created by lines of this source file.
        These include nodes with dumps and those without.
      num_tensors: How many Tensors were created by lines of this source file.
        These include Tensors with dumps and those without.
      num_dumps: How many debug Tensor dumps were from nodes (and Tensors)
        that were created by this source file.
      first_line: The first line number (1-based) that created any nodes or
        Tensors in this source file.

    The list is sorted by ascending order of source_file_path.

  Raises:
    ValueError: If the dump object does not have a Python graph set.
  """

  py_graph = dump.python_graph
  if not py_graph:
    raise ValueError("Cannot generate source list due to a lack of set "
                     "Python graph in the dump object")

  path_to_node_names = collections.defaultdict(set)
  path_to_tensor_names = collections.defaultdict(set)
  path_to_first_line = {}
  tensor_name_to_num_dumps = {}

  path_regex = (re.compile(path_regex_whitelist)
                if path_regex_whitelist else None)
  node_name_regex = (re.compile(node_name_regex_whitelist)
                     if node_name_regex_whitelist else None)

  to_skip_file_paths = set()
  for op in py_graph.get_operations():
    if node_name_regex and not node_name_regex.match(op.name):
      continue

    for file_path, line_number, _, _ in dump.node_traceback(op.name):
      file_path = _norm_abs_path(file_path)
      if (file_path in to_skip_file_paths or
          path_regex and not path_regex.match(file_path) or
          not os.path.isfile(file_path)):
        to_skip_file_paths.add(file_path)
        continue

      path_to_node_names[file_path].add(op.name)
      if file_path in path_to_first_line:
        if path_to_first_line[file_path] > line_number:
          path_to_first_line[file_path] = line_number
      else:
        path_to_first_line[file_path] = line_number

      for output_tensor in op.outputs:
        tensor_name = output_tensor.name
        path_to_tensor_names[file_path].add(tensor_name)

      watch_keys = dump.debug_watch_keys(op.name)
      for watch_key in watch_keys:
        node_name, output_slot, debug_op = watch_key.split(":")
        tensor_name = "%s:%s" % (node_name, output_slot)
        if tensor_name not in tensor_name_to_num_dumps:
          tensor_name_to_num_dumps[tensor_name] = len(
              dump.get_tensors(node_name, int(output_slot), debug_op))

  path_to_num_dumps = {}
  for path in path_to_tensor_names:
    path_to_num_dumps[path] = sum(
        tensor_name_to_num_dumps.get(tensor_name, 0)
        for tensor_name in path_to_tensor_names[path])

  output = []
  for file_path in path_to_node_names:
    output.append((
        file_path,
        guess_is_tensorflow_py_library(file_path),
        len(path_to_node_names.get(file_path, {})),
        len(path_to_tensor_names.get(file_path, {})),
        path_to_num_dumps.get(file_path, 0),
        path_to_first_line[file_path]))

  return sorted(output, key=lambda x: x[0])
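
# Illustrative output entry (values hypothetical):
#   ("/home/user/train.py", False, 12, 17, 34, 8)
# meaning /home/user/train.py created 12 nodes and 17 Tensors, accounted for
# 34 debug Tensor dumps, and first created a node or Tensor on line 8.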


def annotate_source_against_profile(profile_data,
                                    source_file_path,
                                    node_name_filter=None,
                                    op_type_filter=None,
                                    min_line=None,
                                    max_line=None):
  """Annotate a Python source file with profiling information at each line.

  (The annotation doesn't change the source file itself.)

  Args:
    profile_data: (`list` of `ProfileDatum`) A list of `ProfileDatum`.
    source_file_path: (`str`) Path to the source file being annotated.
    node_name_filter: Regular expression to filter by node name.
    op_type_filter: Regular expression to filter by op type.
    min_line: (`None` or `int`) The 1-based line number to start annotating the
      source file from (inclusive).
    max_line: (`None` or `int`) The 1-based line number to end the annotation
      at (exclusive).

  Returns:
    A `dict` mapping 1-based line number to a `profiling.AggregateProfile`
      that aggregates the profiling information of all matching
      `ProfileDatum`s for that line.
  """

  source_file_path = _norm_abs_path(source_file_path)

  node_name_regex = re.compile(node_name_filter) if node_name_filter else None
  op_type_regex = re.compile(op_type_filter) if op_type_filter else None

  line_to_profile_summary = {}
  for profile_datum in profile_data:
    if not profile_datum.file_path:
      continue

    if _norm_abs_path(profile_datum.file_path) != source_file_path:
      continue

    if (min_line is not None and profile_datum.line_number < min_line or
        max_line is not None and profile_datum.line_number >= max_line):
      continue

    if (node_name_regex and
        not node_name_regex.match(profile_datum.node_exec_stats.node_name)):
      continue

    if op_type_regex and not op_type_regex.match(profile_datum.op_type):
      continue

    if profile_datum.line_number not in line_to_profile_summary:
      line_to_profile_summary[profile_datum.line_number] = (
          profiling.AggregateProfile(profile_datum))
    else:
      line_to_profile_summary[profile_datum.line_number].add(profile_datum)

  return line_to_profile_summary
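
# Minimal usage sketch (assuming `profile_data` is a list of
# `profiling.ProfileDatum` collected via tfdbg profiling; the path is
# hypothetical):
#
#   summaries = annotate_source_against_profile(
#       profile_data, "/path/to/train_script.py", op_type_filter="MatMul")
#   # Maps each matching line number to an aggregated profile of the MatMul
#   # ops created on that line.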