Shanqing Cai 319df5224c [tfdbg2] Fix bug in which enable_check_numerics() errors due to empty file path
The helper function `guess_is_tensorflow_py_library()` raises errors for file
paths that lack the expected extensions for Python source files. As a result,
it errors out when the file path is an empty string, which users have been
observed to encounter in practice.

This CL makes the helper function return False for such malformed file paths
instead of raising errors.
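
An illustrative example of the new behavior (hypothetical paths):

    from tensorflow.python.debug.lib import source_utils
    source_utils.guess_is_tensorflow_py_library("")           # False, no error
    source_utils.guess_is_tensorflow_py_library("notes.txt")  # False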

PiperOrigin-RevId: 315276482
Change-Id: Ib19750d502cd55fcfbef6f95a2064adc2e0816e9
2020-06-08 08:14:01 -07:00


# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Classes and functions that help to inspect Python source w.r.t. TF graphs."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import os
import re
import zipfile

import absl
import numpy as np

from tensorflow.python.debug.lib import profiling


_TENSORFLOW_BASEDIR = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.dirname(
        os.path.normpath(os.path.abspath(__file__))))))

_ABSL_BASEDIR = os.path.dirname(absl.__file__)

# The trailing comma is required: (".py") is a plain string, and the `in`
# checks below would then do substring matching (e.g., "" in ".py" is True).
UNCOMPILED_SOURCE_SUFFIXES = (".py",)
COMPILED_SOURCE_SUFFIXES = (".pyc", ".pyo")


def _norm_abs_path(file_path):
  return os.path.normpath(os.path.abspath(file_path))


def is_extension_uncompiled_python_source(file_path):
  _, extension = os.path.splitext(file_path)
  return extension.lower() in UNCOMPILED_SOURCE_SUFFIXES


def is_extension_compiled_python_source(file_path):
  _, extension = os.path.splitext(file_path)
  return extension.lower() in COMPILED_SOURCE_SUFFIXES
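
# Illustrative examples of the extension checks above (hypothetical paths):
#   is_extension_uncompiled_python_source("model.py")    # -> True
#   is_extension_compiled_python_source("model.pyc")     # -> True
#   is_extension_uncompiled_python_source("model.cc")    # -> False
#   is_extension_uncompiled_python_source("")            # -> False (no extension)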


def _convert_watch_key_to_tensor_name(watch_key):
  return watch_key[:watch_key.rfind(":")]
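
# Illustrative example: a debug watch key has the form
# "<node_name>:<output_slot>:<debug_op>" (see the split in
# list_source_files_against_dump below), so dropping the last ":"-delimited
# field yields the tensor name, e.g.:
#   _convert_watch_key_to_tensor_name("dense/MatMul:0:DebugIdentity")
#   # -> "dense/MatMul:0"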


def guess_is_tensorflow_py_library(py_file_path):
  """Guess whether a Python source file is a part of the tensorflow library.

  Special cases:
    1) Returns False for unit-test files in the library (*_test.py),
    2) Returns False for files under python/debug/examples.

  Args:
    py_file_path: full path of the Python source file in question.

  Returns:
    (`bool`) Whether the file is inferred to be a part of the tensorflow
      library.
  """
  if (not is_extension_uncompiled_python_source(py_file_path) and
      not is_extension_compiled_python_source(py_file_path)):
    return False
  py_file_path = _norm_abs_path(py_file_path)
  return ((py_file_path.startswith(_TENSORFLOW_BASEDIR) or
           py_file_path.startswith(_ABSL_BASEDIR)) and
          not py_file_path.endswith("_test.py") and
          (os.path.normpath("tensorflow/python/debug/examples") not in
           os.path.normpath(py_file_path)))
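
# Illustrative examples of the behavior described in the docstring above
# (paths are hypothetical):
#   guess_is_tensorflow_py_library("")            # -> False (no error raised)
#   guess_is_tensorflow_py_library("notes.txt")   # -> False (not a Python file)
#   guess_is_tensorflow_py_library(__file__)      # -> True (this module is
#                                                 #    part of the TF library)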


def load_source(source_file_path):
  """Load the content of a Python source code file.

  This function covers the following cases:

  1. source_file_path points to an existing Python (.py) file on the
     file system.
  2. source_file_path is a path within a .par file (i.e., a zip-compressed,
     self-contained Python executable).

  Args:
    source_file_path: Path to the Python source file to read.

  Returns:
    A length-2 tuple:
      - Lines of the source file, as a `list` of `str`s.
      - The width of the string needed to show the line number in the file.
        This is calculated based on the number of lines in the source file.

  Raises:
    IOError: if loading is unsuccessful.
  """
  if os.path.isfile(source_file_path):
    with open(source_file_path, "rb") as f:
      source_text = f.read().decode("utf-8")
    source_lines = source_text.split("\n")
  else:
    # One possible reason why the file doesn't exist is that it's a path
    # inside a .par file. Try that possibility.
    source_lines = _try_load_par_source(source_file_path)
    if source_lines is None:
      raise IOError(
          "Source path neither exists nor can be loaded as a .par file: %s" %
          source_file_path)
  line_num_width = int(np.ceil(np.log10(len(source_lines)))) + 3
  return source_lines, line_num_width
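
# Illustrative example of the line-number width calculation above: for a
# 379-line file, np.log10(379) ~= 2.58, which np.ceil rounds up to 3.0; the
# extra 3 leaves room for padding, so line_num_width == 6.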


def _try_load_par_source(source_file_path):
  """Try loading the source code inside a .par file.

  A .par file is a zip-compressed, self-contained Python executable.
  It contains the content of individual Python source files that can
  be read only through extracting from the zip file.

  Args:
    source_file_path: The full path to the file inside the .par file. This
      path should include the path to the .par file itself, followed by the
      intra-par path, e.g.,
      "/tmp/my_executable.par/org-tensorflow/tensorflow/python/foo/bar.py".

  Returns:
    If successful, lines of the source file as a `list` of `str`s.
    Else, `None`.
  """
  prefix_path = source_file_path
  while True:
    prefix_path, basename = os.path.split(prefix_path)
    if not basename:
      break
    suffix_path = os.path.normpath(
        os.path.relpath(source_file_path, start=prefix_path))
    if prefix_path.endswith(".par") and os.path.isfile(prefix_path):
      with zipfile.ZipFile(prefix_path) as z:
        norm_names = [os.path.normpath(name) for name in z.namelist()]
        if suffix_path in norm_names:
          with z.open(z.namelist()[norm_names.index(suffix_path)]) as zf:
            source_text = zf.read().decode("utf-8")
            return source_text.split("\n")
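
# Illustrative walk-through, using the docstring's example path: the loop
# peels one path component off the end at a time, so for
# "/tmp/my_executable.par/org-tensorflow/tensorflow/python/foo/bar.py" it
# eventually reaches prefix_path == "/tmp/my_executable.par" (which ends with
# ".par" and exists as a file) and suffix_path ==
# "org-tensorflow/tensorflow/python/foo/bar.py", which is then extracted
# from the zip archive.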


def annotate_source(dump,
                    source_file_path,
                    do_dumped_tensors=False,
                    file_stack_top=False,
                    min_line=None,
                    max_line=None):
  """Annotate a Python source file with a list of ops created at each line.

  (The annotation doesn't change the source file itself.)

  Args:
    dump: (`DebugDumpDir`) A `DebugDumpDir` object of which the Python graph
      has been loaded.
    source_file_path: (`str`) Path to the source file being annotated.
    do_dumped_tensors: (`bool`) Whether dumped Tensors, instead of ops, are to
      be used to annotate the source file.
    file_stack_top: (`bool`) Whether only the top stack trace in the
      specified source file is to be annotated.
    min_line: (`None` or `int`) The 1-based line number to start annotating
      the source file from (inclusive).
    max_line: (`None` or `int`) The 1-based line number to end the annotation
      at (exclusive).

  Returns:
    A `dict` mapping 1-based line number to a list of op name(s) created at
    that line, or tensor names if `do_dumped_tensors` is True.

  Raises:
    ValueError: If the dump object does not have a Python graph set.
  """
  py_graph = dump.python_graph
  if not py_graph:
    raise ValueError("Cannot perform source annotation due to a lack of set "
                     "Python graph in the dump object")

  source_file_path = _norm_abs_path(source_file_path)

  line_to_op_names = {}
  for op in py_graph.get_operations():
    for file_path, line_number, _, _ in reversed(dump.node_traceback(op.name)):
      if (min_line is not None and line_number < min_line or
          max_line is not None and line_number >= max_line):
        continue
      if _norm_abs_path(file_path) != source_file_path:
        continue

      if do_dumped_tensors:
        watch_keys = dump.debug_watch_keys(op.name)
        # Convert watch keys to unique Tensor names.
        items_to_append = list(
            set(map(_convert_watch_key_to_tensor_name, watch_keys)))
      else:
        items_to_append = [op.name]

      if line_number in line_to_op_names:
        line_to_op_names[line_number].extend(items_to_append)
      else:
        line_to_op_names[line_number] = items_to_append

      if file_stack_top:
        break

  return line_to_op_names
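
# Illustrative usage sketch (assumes `dump` is a DebugDumpDir whose Python
# graph has been set, e.g. via its set_python_graph() method):
#   line_to_ops = annotate_source(dump, "/path/to/train.py")
#   for line_number in sorted(line_to_ops):
#     print(line_number, line_to_ops[line_number])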


def list_source_files_against_dump(dump,
                                   path_regex_whitelist=None,
                                   node_name_regex_whitelist=None):
  """Generate a list of source files with information regarding ops and tensors.

  Args:
    dump: (`DebugDumpDir`) A `DebugDumpDir` object of which the Python graph
      has been loaded.
    path_regex_whitelist: A regular-expression filter for source file path.
    node_name_regex_whitelist: A regular-expression filter for node names.

  Returns:
    A list of tuples regarding the Python source files involved in
    constructing the ops and tensors contained in `dump`. Each tuple is:
      (source_file_path, is_tf_library, num_nodes, num_tensors, num_dumps,
       first_line)

      is_tf_library: (`bool`) A guess of whether the file belongs to the
        TensorFlow Python library.
      num_nodes: How many nodes were created by lines of this source file.
        These include nodes with dumps and those without.
      num_tensors: How many Tensors were created by lines of this source file.
        These include Tensors with dumps and those without.
      num_dumps: How many debug Tensor dumps were from nodes (and Tensors)
        that were created by this source file.
      first_line: The first line number (1-based) that created any nodes or
        Tensors in this source file.

    The list is sorted by ascending order of source_file_path.

  Raises:
    ValueError: If the dump object does not have a Python graph set.
  """
  py_graph = dump.python_graph
  if not py_graph:
    raise ValueError("Cannot generate source list due to a lack of set "
                     "Python graph in the dump object")

  path_to_node_names = collections.defaultdict(set)
  path_to_tensor_names = collections.defaultdict(set)
  path_to_first_line = {}
  tensor_name_to_num_dumps = {}

  path_regex = (re.compile(path_regex_whitelist)
                if path_regex_whitelist else None)
  node_name_regex = (re.compile(node_name_regex_whitelist)
                     if node_name_regex_whitelist else None)

  to_skip_file_paths = set()
  for op in py_graph.get_operations():
    if node_name_regex and not node_name_regex.match(op.name):
      continue

    for file_path, line_number, _, _ in dump.node_traceback(op.name):
      file_path = _norm_abs_path(file_path)
      if (file_path in to_skip_file_paths or
          path_regex and not path_regex.match(file_path) or
          not os.path.isfile(file_path)):
        to_skip_file_paths.add(file_path)
        continue

      path_to_node_names[file_path].add(op.name)
      if file_path in path_to_first_line:
        if path_to_first_line[file_path] > line_number:
          path_to_first_line[file_path] = line_number
      else:
        path_to_first_line[file_path] = line_number

      for output_tensor in op.outputs:
        tensor_name = output_tensor.name
        path_to_tensor_names[file_path].add(tensor_name)

      watch_keys = dump.debug_watch_keys(op.name)
      for watch_key in watch_keys:
        node_name, output_slot, debug_op = watch_key.split(":")
        tensor_name = "%s:%s" % (node_name, output_slot)
        if tensor_name not in tensor_name_to_num_dumps:
          tensor_name_to_num_dumps[tensor_name] = len(
              dump.get_tensors(node_name, int(output_slot), debug_op))

  path_to_num_dumps = {}
  for path in path_to_tensor_names:
    path_to_num_dumps[path] = sum(
        tensor_name_to_num_dumps.get(tensor_name, 0)
        for tensor_name in path_to_tensor_names[path])

  output = []
  for file_path in path_to_node_names:
    output.append((
        file_path,
        guess_is_tensorflow_py_library(file_path),
        len(path_to_node_names.get(file_path, {})),
        len(path_to_tensor_names.get(file_path, {})),
        path_to_num_dumps.get(file_path, 0),
        path_to_first_line[file_path]))

  return sorted(output, key=lambda x: x[0])
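
# Illustrative usage sketch (same DebugDumpDir assumption as above):
#   for (path, is_tf_lib, num_nodes, num_tensors, num_dumps,
#        first_line) in list_source_files_against_dump(dump):
#     if not is_tf_lib:  # Focus on user code rather than TF-library files.
#       print("%s: %d nodes, %d tensors, %d dumps, first line %d" %
#             (path, num_nodes, num_tensors, num_dumps, first_line))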


def annotate_source_against_profile(profile_data,
                                    source_file_path,
                                    node_name_filter=None,
                                    op_type_filter=None,
                                    min_line=None,
                                    max_line=None):
  """Annotate a Python source file with profiling information at each line.

  (The annotation doesn't change the source file itself.)

  Args:
    profile_data: (`list` of `ProfileDatum`) A list of `ProfileDatum`.
    source_file_path: (`str`) Path to the source file being annotated.
    node_name_filter: Regular expression to filter by node name.
    op_type_filter: Regular expression to filter by op type.
    min_line: (`None` or `int`) The 1-based line number to start annotating
      the source file from (inclusive).
    max_line: (`None` or `int`) The 1-based line number to end the annotation
      at (exclusive).

  Returns:
    A `dict` mapping 1-based line number to a `profiling.AggregateProfile`
    object that aggregates the matching `ProfileDatum`s for that line.
  """
  source_file_path = _norm_abs_path(source_file_path)

  node_name_regex = re.compile(node_name_filter) if node_name_filter else None
  op_type_regex = re.compile(op_type_filter) if op_type_filter else None

  line_to_profile_summary = {}
  for profile_datum in profile_data:
    if not profile_datum.file_path:
      continue
    if _norm_abs_path(profile_datum.file_path) != source_file_path:
      continue
    if (min_line is not None and profile_datum.line_number < min_line or
        max_line is not None and profile_datum.line_number >= max_line):
      continue
    if (node_name_regex and
        not node_name_regex.match(profile_datum.node_exec_stats.node_name)):
      continue
    if op_type_regex and not op_type_regex.match(profile_datum.op_type):
      continue

    if profile_datum.line_number not in line_to_profile_summary:
      line_to_profile_summary[profile_datum.line_number] = (
          profiling.AggregateProfile(profile_datum))
    else:
      line_to_profile_summary[profile_datum.line_number].add(profile_datum)

  return line_to_profile_summary
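
# Illustrative usage sketch (assumes `profile_data` is a list of
# profiling.ProfileDatum objects; the attribute printed below is an
# assumption about AggregateProfile's interface):
#   line_to_summary = annotate_source_against_profile(
#       profile_data, "/path/to/train.py", op_type_filter="MatMul")
#   for line_number, aggregate_profile in sorted(line_to_summary.items()):
#     print(line_number, aggregate_profile.total_op_time)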