Implement async Python stack trace error reporting.

PiperOrigin-RevId: 322211018
Change-Id: I0d02e98748cc79cca04dc71fc828990548fce70f
Kibeom Kim, 2020-07-20 13:17:25 -07:00, committed by TensorFlower Gardener
parent 6ee2d328fd
commit fa85309fb5
18 changed files with 288 additions and 31 deletions


@@ -308,6 +308,7 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/util:abstract_stack_trace",
         "@com_google_absl//absl/types:span",
     ],
 )


@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/platform/casts.h"
 #include "tensorflow/core/platform/status.h"
+#include "tensorflow/core/util/abstract_stack_trace.h"

 struct TFE_Op;

@@ -44,6 +45,12 @@ class ImmediateExecutionOperation : public AbstractOperation {
   // Experimental
   virtual Status SetUseXla(bool enable) = 0;

+  // Set the stack trace to be used for potential async error reporting.
+  virtual void SetStackTrace(AbstractStackTrace stack_trace) = 0;
+
+  // Returns the stack trace set by `SetStackTrace`, if one exists.
+  virtual absl::optional<AbstractStackTrace> GetStackTrace() = 0;
+
   // For LLVM style RTTI.
   static bool classof(const AbstractOperation* ptr) {
     return ptr->getKind() == kEager || ptr->getKind() == kTfrt;


@@ -163,6 +163,7 @@ tf_cuda_library(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:lib",
         "//tensorflow/core/platform:platform_port",
+        "//tensorflow/core/util:abstract_stack_trace",
     ] + select({
         "//tensorflow:android": [
             "//tensorflow/core:portable_tensorflow_lib_lite",


@@ -306,6 +306,7 @@ Status EagerOperation::Reset(
   }
   attrs_.Reset(op);
   use_xla_ = false;
+  stack_trace_.reset();
   is_function_ = is_function;
   cancellation_manager_ = nullptr;
   executor_ = executor ? executor : &ctx_.Executor();


@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/framework/cancellation.h"
 #include "tensorflow/core/framework/device_attributes.pb.h"
 #include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/util/abstract_stack_trace.h"
 #include "tensorflow/core/util/device_name_utils.h"

 namespace tensorflow {

@@ -120,6 +121,14 @@ class EagerOperation : public ImmediateExecutionOperation {
   Status SetUseXla(bool enable) override;

+  void SetStackTrace(AbstractStackTrace stack_trace) override {
+    stack_trace_ = stack_trace;
+  }
+
+  absl::optional<AbstractStackTrace> GetStackTrace() override {
+    return stack_trace_;
+  }
+
   Status Reset(const char* op, const char* device_name, bool remote,
                EagerExecutor* executor,
                const absl::optional<EagerRemoteFunctionParams>

@@ -218,6 +227,7 @@ class EagerOperation : public ImmediateExecutionOperation {
   VariantDevice device_;
   bool use_xla_ = false;
+  absl::optional<AbstractStackTrace> stack_trace_;
   bool is_function_;  // Conceptually const, but can't be because of Reset
   bool colocation_exempt_;
   CancellationManager* cancellation_manager_ = nullptr;  // Not owned.


@@ -634,7 +634,7 @@ Status EagerLocalExecute(EagerOperation* op, TensorHandle** retvals,
     auto node = absl::make_unique<AsyncExecuteNode>(
         &ctx, op->Inputs(), op->remote_func_params(), std::move(kernel),
         graph_collector, op->GetCancellationManager(),
-        absl::Span<TensorHandle*>(retvals, num_outputs));
+        absl::Span<TensorHandle*>(retvals, num_outputs), op->GetStackTrace());
     // Release the inputs from the eager operation since the AsyncExecuteNode
     // would have taken ownership. This allows the inputs to be forwarded if
     // possible.


@@ -150,14 +150,16 @@ class AsyncExecuteNode : public EagerNode {
                    core::RefCountPtr<KernelAndDevice> kernel,
                    GraphCollector* graph_collector,
                    CancellationManager* cancellation_manager,
-                   absl::Span<TensorHandle*> retvals)
+                   absl::Span<TensorHandle*> retvals,
+                   absl::optional<AbstractStackTrace> stack_trace)
       : EagerNode(),
         ctx_(ctx),
         inputs_(inputs),
         remote_func_params_(remote_func_params),
         kernel_(std::move(kernel)),
         graph_collector_(graph_collector),
-        cancellation_manager_(cancellation_manager) {
+        cancellation_manager_(cancellation_manager),
+        stack_trace_(stack_trace) {
     // Copy the output handles, since the container for them might get
     // destroyed.
     for (auto handle : retvals) {

@@ -194,10 +196,14 @@
       }
       ++i;
     }
-    const Status status = EagerKernelExecute(
+    Status status = EagerKernelExecute(
         ctx_, inputs_, remote_func_params_, kernel_, graph_collector_,
         cancellation_manager_, absl::MakeSpan(retvals_));
     if (!status.ok()) {
+      if (stack_trace_.has_value()) {
+        status = Status(status.code(), status.error_message(),
+                        stack_trace_->ToStackFrames());
+      }
       Abort(status);
       return status;
     }

@@ -227,6 +233,7 @@
   core::RefCountPtr<KernelAndDevice> kernel_;
   GraphCollector* graph_collector_;
   CancellationManager* const cancellation_manager_;
+  absl::optional<AbstractStackTrace> stack_trace_;
   absl::InlinedVector<TensorHandle*, 2> retvals_;
 };


@@ -62,9 +62,11 @@ inline const strings::AlphaNum& PrepareForStrCat(const strings::AlphaNum& a) {
 // to be several layers of additional context.
 template <typename... Args>
 void AppendToMessage(::tensorflow::Status* status, Args... args) {
+  std::vector<StackFrame> stack_trace = status->stack_trace();
   *status = ::tensorflow::Status(
       status->code(),
-      ::tensorflow::strings::StrCat(status->error_message(), "\n\t", args...));
+      ::tensorflow::strings::StrCat(status->error_message(), "\n\t", args...),
+      std::move(stack_trace));
 }

 // For propagating errors when calling a function.


@@ -89,11 +89,13 @@ class StatusLogSink : public TFLogSink {
 }  // namespace

-Status::Status(tensorflow::error::Code code, StringPiece msg) {
+Status::Status(tensorflow::error::Code code, tensorflow::StringPiece msg,
+               std::vector<StackFrame>&& stack_trace) {
   assert(code != tensorflow::error::OK);
   state_ = std::unique_ptr<State>(new State);
   state_->code = code;
   state_->msg = string(msg);
+  state_->stack_trace = std::move(stack_trace);
   VLOG(5) << "Generated non-OK status: \"" << *this << "\". "
           << CurrentStackTrace();
 }

@@ -117,6 +119,11 @@ const string& Status::empty_string() {
   return *empty;
 }

+const std::vector<StackFrame>& Status::empty_stack_trace() {
+  static std::vector<StackFrame>* empty = new std::vector<StackFrame>();
+  return *empty;
+}
+
 string error_name(error::Code code) {
   switch (code) {
     case tensorflow::error::OK:


@@ -29,6 +29,13 @@ limitations under the License.
 namespace tensorflow {

+// A struct representing a frame in a stack trace.
+struct StackFrame {
+  std::string file_name;
+  int line_number;
+  std::string function_name;
+};
+
 #if defined(__clang__)
 // Only clang supports warn_unused_result as a type annotation.
 class TF_MUST_USE_RESULT Status;

@@ -43,7 +50,15 @@ class Status {
   /// \brief Create a status with the specified error code and msg as a
   /// human-readable string containing more detailed information.
-  Status(tensorflow::error::Code code, tensorflow::StringPiece msg);
+  Status(tensorflow::error::Code code, tensorflow::StringPiece msg)
+      : Status(code, msg, {}) {}
+
+  /// \brief Create a status with the specified error code, msg, and stack trace
+  /// as a human-readable string containing more detailed information.
+#ifndef SWIG
+  Status(tensorflow::error::Code code, tensorflow::StringPiece msg,
+         std::vector<StackFrame>&& stack_trace);
+#endif

   /// Copy the specified status.
   Status(const Status& s);

@@ -66,6 +81,10 @@ class Status {
     return ok() ? empty_string() : state_->msg;
   }

+  const std::vector<StackFrame>& stack_trace() const {
+    return ok() ? empty_stack_trace() : state_->stack_trace;
+  }
+
   bool operator==(const Status& x) const;
   bool operator!=(const Status& x) const;

@@ -91,9 +110,11 @@ class Status {
  private:
   static const std::string& empty_string();
+  static const std::vector<StackFrame>& empty_stack_trace();
   struct State {
     tensorflow::error::Code code;
     std::string msg;
+    std::vector<StackFrame> stack_trace;
   };
   // OK status has a `NULL` state_. Otherwise, `state_` points to
   // a `State` structure containing the error code and message(s)
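For orientation (this sketch is not part of the commit), here is how the new stack-trace-carrying constructor and the stack_trace() accessor declared above are meant to compose; the frame values and the example namespace are hypothetical.

// Sketch only: not part of the diff. Frame contents are made up for illustration.
#include <utility>
#include <vector>

#include "tensorflow/core/platform/status.h"

namespace example {

tensorflow::Status InvalidArgWithTrace() {
  // Real callers obtain frames from AbstractStackTrace::ToStackFrames();
  // this single frame is a hypothetical placeholder.
  std::vector<tensorflow::StackFrame> frames = {{"example.py", 12, "call_op"}};
  return tensorflow::Status(tensorflow::error::INVALID_ARGUMENT,
                            "illustrative failure", std::move(frames));
}

void Inspect(const tensorflow::Status& s) {
  // For OK statuses (or statuses built without frames) this loops over the
  // shared empty vector returned by Status::empty_stack_trace().
  for (const tensorflow::StackFrame& frame : s.stack_trace()) {
    (void)frame.file_name;      // e.g. "example.py"
    (void)frame.line_number;    // e.g. 12
    (void)frame.function_name;  // e.g. "call_op"
  }
}

}  // namespace example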


@@ -61,6 +61,7 @@ filegroup(
 filegroup(
     name = "mobile_srcs_only_runtime",
     srcs = [
+        "abstract_stack_trace.h",
         "batch_util.cc",
         "batch_util.h",
         "bcast.cc",

@@ -313,6 +314,7 @@ filegroup(
 filegroup(
     name = "framework_srcs",
     srcs = [
+        "abstract_stack_trace.h",
         "activation_mode.h",
         "batch_util.h",
         "bcast.h",

@@ -437,6 +439,22 @@ cc_library(
     alwayslink = 1,
 )

+cc_library(
+    name = "abstract_stack_trace",
+    hdrs = ["abstract_stack_trace.h"],
+    visibility = [
+        "//tensorflow/c/eager:__pkg__",
+        "//tensorflow/core:__pkg__",
+        "//tensorflow/core/common_runtime/eager:__pkg__",
+        "//tensorflow/core/platform:__pkg__",
+        "//tensorflow/python:__pkg__",
+        "//tensorflow/python/eager:__pkg__",
+    ],
+    deps = [
+        "//tensorflow/core/platform:status",
+    ],
+)
+
 tf_cuda_library(
     name = "gpu_cuda_alias",
     hdrs = ["gpu_cuda_alias.h"],


@@ -0,0 +1,44 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_ABSTRACT_STACK_TRACE_H_
+#define TENSORFLOW_CORE_UTIL_ABSTRACT_STACK_TRACE_H_
+
+#include <string>
+
+#include "tensorflow/core/platform/status.h"
+
+namespace tensorflow {
+
+// Language agnostic stack trace class. It only saves an id, and language
+// clients are responsible for managing the actual stack trace objects.
+class AbstractStackTrace {
+ public:
+  AbstractStackTrace(int id, std::vector<StackFrame> (*to_stack_frames)(int))
+      : id_(id), to_stack_frames_(to_stack_frames) {}
+
+  // Returns stack trace as a vector of `StackFrame`s.
+  std::vector<StackFrame> ToStackFrames() const {
+    return to_stack_frames_(id_);
+  }
+
+ private:
+  int id_;
+  std::vector<StackFrame> (*to_stack_frames_)(int);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_ABSTRACT_STACK_TRACE_H_
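A short sketch (not from the commit) of the contract AbstractStackTrace sets up: a language client keeps its own id-to-frames mapping and passes a plain function pointer that materializes frames lazily. The registry below is hypothetical; the Python client added elsewhere in this commit uses StackTraceManager for this purpose.

// Sketch only: a hypothetical client of AbstractStackTrace.
#include <vector>

#include "tensorflow/core/util/abstract_stack_trace.h"

namespace example {

// Hypothetical lookup into whatever store the client maintains, keyed by the
// id handed out at capture time.
std::vector<tensorflow::StackFrame> LookUpFrames(int id) {
  return {{"client_file", id, "captured_function"}};  // placeholder frames
}

tensorflow::AbstractStackTrace MakeTrace(int id) {
  // Only the id and the conversion function pointer are stored; frames are
  // produced on demand via ToStackFrames().
  return tensorflow::AbstractStackTrace(id, &LookUpFrames);
}

}  // namespace example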


@@ -5664,9 +5664,11 @@ cc_library(
     hdrs = ["util/stack_trace.h"],
     deps = [
         ":py_util",
+        "//tensorflow/core/platform:str_util",
+        "//tensorflow/core/platform:stringpiece",
+        "//tensorflow/core/util:abstract_stack_trace",
         "//third_party/python_runtime:headers",  # buildcleaner: keep
         "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/types:optional",
     ],
 )


@@ -56,13 +56,16 @@ cc_library(
         "//tensorflow/core/platform:logging",
         "//tensorflow/core/platform:types",
         "//tensorflow/core/profiler/lib:traceme",
+        "//tensorflow/core/util:abstract_stack_trace",
         "//tensorflow/python:cpp_python_util",
         "//tensorflow/python:ndarray_tensor",
         "//tensorflow/python:ndarray_tensor_bridge",
         "//tensorflow/python:numpy_lib",
         "//tensorflow/python:py_exception_registry",
         "//tensorflow/python:py_seq_tensor",
+        "//tensorflow/python:py_util",
         "//tensorflow/python:safe_ptr",
+        "//tensorflow/python:stack_trace",
         "//third_party/py/numpy:headers",
         "//third_party/python_runtime:headers",
         "@com_google_absl//absl/container:flat_hash_map",


@@ -30,6 +30,7 @@ from tensorflow.python.eager import test
 from tensorflow.python.framework import config
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util

@@ -480,6 +481,24 @@ class OpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     self.assertIs(weak_x(), None)
     self.assertIs(weak_y(), None)

+  def testAsyncExceptionStackTrace(self):
+    config.set_synchronous_execution(False)
+
+    def exception_originated_from_here():
+      # Invalid shapes for matmul.
+      return math_ops.matmul([[1]], [[2], [3]])
+
+    # In sync mode, an exception would have been raised here; since this is
+    # async mode, the exception is raised on the next use of the result.
+    x = exception_originated_from_here()
+
+    with self.assertRaisesRegex(errors_impl.InvalidArgumentError,
+                                'in exception_originated_from_here'):
+      x.numpy()
+
+    context.async_clear_error()
+    config.set_synchronous_execution(True)
+

 if __name__ == '__main__':
   test.main()


@@ -41,10 +41,13 @@ limitations under the License.
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/profiler/lib/traceme.h"
+#include "tensorflow/core/util/abstract_stack_trace.h"
 #include "tensorflow/python/eager/pywrap_gradient_exclusions.h"
 #include "tensorflow/python/eager/pywrap_tensor.h"
 #include "tensorflow/python/eager/pywrap_tfe.h"
+#include "tensorflow/python/lib/core/py_util.h"
 #include "tensorflow/python/lib/core/safe_ptr.h"
+#include "tensorflow/python/util/stack_trace.h"
 #include "tensorflow/python/util/util.h"

 using tensorflow::string;

@@ -854,10 +857,14 @@ void TFE_Py_ExecuteCancelable(TFE_Context* ctx, const char* device_name,
                               TF_Status* out_status) {
   tensorflow::profiler::TraceMe activity(
       "TFE_Py_ExecuteCancelable", tensorflow::profiler::TraceMeLevel::kInfo);

   TFE_Op* op = GetOp(ctx, op_name, device_name, out_status);

   auto cleaner = tensorflow::gtl::MakeCleanup([ctx, op] { ReturnOp(ctx, op); });
   if (!out_status->status.ok()) return;

+  tensorflow::unwrap(op)->SetStackTrace(tensorflow::GetStackTrace());
+
   for (int i = 0; i < inputs->size() && out_status->status.ok(); ++i) {
     TFE_OpAddInput(op, inputs->at(i), out_status);
   }

@@ -970,14 +977,54 @@ void RaiseFallbackException(const char* message) {
          .data());
 }

+// Format and return `status`' error message with the attached stack trace if
+// available. `status` must have an error.
+std::string FormatErrorStatusStackTrace(const tensorflow::Status& status) {
+  tensorflow::DCheckPyGilState();
+  DCHECK(!status.ok());
+
+  if (status.stack_trace().empty()) return status.error_message();
+
+  const std::vector<tensorflow::StackFrame>& stack_trace = status.stack_trace();
+
+  PyObject* linecache = PyImport_ImportModule("linecache");
+  PyObject* getline =
+      PyObject_GetAttr(linecache, PyUnicode_FromString("getline"));
+  DCHECK(getline);
+
+  std::ostringstream result;
+  result << "Exception originated from\n\n";
+
+  for (const tensorflow::StackFrame& stack_frame : stack_trace) {
+    PyObject* line_str_obj = PyObject_CallFunction(
+        getline, const_cast<char*>("si"), stack_frame.file_name.c_str(),
+        stack_frame.line_number);
+    tensorflow::StringPiece line_str = TFE_GetPythonString(line_str_obj);
+    tensorflow::str_util::RemoveWhitespaceContext(&line_str);
+    result << "  File \"" << stack_frame.file_name << "\", line "
+           << stack_frame.line_number << ", in " << stack_frame.function_name
+           << '\n';
+
+    if (!line_str.empty()) result << "    " << line_str << '\n';
+    Py_XDECREF(line_str_obj);
+  }
+
+  Py_DecRef(getline);
+  Py_DecRef(linecache);
+
+  result << '\n' << status.error_message();
+  return result.str();
+}
+
 int MaybeRaiseExceptionFromTFStatus(TF_Status* status, PyObject* exception) {
   if (status->status.ok()) return 0;
   const char* msg = TF_Message(status);
   if (exception == nullptr) {
     tensorflow::mutex_lock l(exception_class_mutex);
     if (exception_class != nullptr) {
-      tensorflow::Safe_PyObjectPtr val(
-          Py_BuildValue("si", msg, TF_GetCode(status)));
+      tensorflow::Safe_PyObjectPtr val(Py_BuildValue(
+          "si", FormatErrorStatusStackTrace(status->status).c_str(),
+          TF_GetCode(status)));
       if (PyErr_Occurred()) {
         // NOTE: This hides the actual error (i.e. the reason `status` was not
         // TF_OK), but there is nothing we can do at this point since we can't

@@ -1003,7 +1050,8 @@ int MaybeRaiseExceptionFromStatus(const tensorflow::Status& status,
   if (exception == nullptr) {
     tensorflow::mutex_lock l(exception_class_mutex);
     if (exception_class != nullptr) {
-      tensorflow::Safe_PyObjectPtr val(Py_BuildValue("si", msg, status.code()));
+      tensorflow::Safe_PyObjectPtr val(Py_BuildValue(
+          "si", FormatErrorStatusStackTrace(status).c_str(), status.code()));
       PyErr_SetObject(exception_class, val.get());
       return -1;
     } else {

@@ -3527,6 +3575,8 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject* args) {
   }

   TFE_Op* op = GetOp(ctx, op_name, op_exec_info.device_name, status);
+
+  tensorflow::unwrap(op)->SetStackTrace(tensorflow::GetStackTrace());
+
   auto cleaner = tensorflow::gtl::MakeCleanup([status, ctx, op] {
     ReturnStatus(status);
     ReturnOp(ctx, op);

@@ -3746,11 +3796,14 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject* args) {
   if (!status->status.ok()) {
     // Augment the status with the op_name for easier debugging similar to
     // TFE_Py_Execute.
-    TF_SetStatus(status, TF_GetCode(status),
-                 tensorflow::strings::StrCat(
-                     TF_Message(status),
-                     " [Op:", TFE_GetPythonString(op_exec_info.op_name), "]")
-                     .c_str());
+    std::vector<tensorflow::StackFrame> stack_trace =
+        status->status.stack_trace();
+    status->status = tensorflow::Status(
+        status->status.code(),
+        tensorflow::strings::StrCat(
+            TF_Message(status),
+            " [Op:", TFE_GetPythonString(op_exec_info.op_name), "]"),
+        std::move(stack_trace));

     MaybeRaiseExceptionFromTFStatus(status, nullptr);
     return nullptr;
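For orientation (not part of the diff), the message assembled by FormatErrorStatusStackTrace above has roughly the following shape for a single recorded frame; the path and source line here are hypothetical placeholders:

Exception originated from

  File "example.py", line 12, in exception_originated_from_here
    return math_ops.matmul([[1]], [[2], [3]])

<original status message, e.g. the matmul shape-mismatch text, with " [Op:MatMul]" appended by the fast-path code above>

The regex in testAsyncExceptionStackTrace matches the ", in <function name>" portion of this format.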


@@ -15,6 +15,9 @@ limitations under the License.
 #include "tensorflow/python/util/stack_trace.h"

+#include "tensorflow/core/platform/str_util.h"
+#include "tensorflow/core/platform/stringpiece.h"
+
 namespace {

 // Returns C string from a Python string object. Handles Python2/3 strings.

@@ -31,22 +34,33 @@ const char* GetPythonString(PyObject* o) {
   return PyBytes_AsString(o);
 #endif
 }

 }  // namespace

 namespace tensorflow {

-std::string StackTrace::ToString() const {
+std::vector<StackFrame> StackTrace::ToStackFrames() const {
   DCheckPyGilState();
-  std::ostringstream result;
+  std::vector<StackFrame> result;
+  result.reserve(size_);
   for (int i = size_ - 1; i >= 0; --i) {
-    result << "  File \"" << PyUnicode_AsUTF8(code_objs_[i]->co_filename)
-           << "\", line "
-           << PyCode_Addr2Line(code_objs_[i], last_instructions_[i]) << ", in "
-           << GetPythonString(code_objs_[i]->co_name)
-           << "\n    <source line unimplemented>\n";
+    const char* file_name = GetPythonString(code_objs_[i]->co_filename);
+    const int line_number =
+        PyCode_Addr2Line(code_objs_[i], last_instructions_[i]);
+    result.emplace_back(StackFrame{file_name, line_number,
+                                   GetPythonString(code_objs_[i]->co_name)});
+    // TODO(kkb): Add source code line. See tf_stack.cc's
+    // FrameSummary::line() function.
   }
-  return result.str();
+  return result;
 }

+StackTrace* StackTraceManager::Get(int id) {
+  DCheckPyGilState();
+  if (next_id_ - id > kStackTraceCircularBufferSize) return nullptr;
+  return &stack_traces_[id & (kStackTraceCircularBufferSize - 1)];
+}
+
+StackTraceManager* const stack_trace_manager = new StackTraceManager();
+
 }  // namespace tensorflow


@@ -25,6 +25,8 @@ limitations under the License.
 #include "absl/base/attributes.h"
 #include "absl/base/optimization.h"
+#include "absl/types/optional.h"
+#include "tensorflow/core/util/abstract_stack_trace.h"
 #include "tensorflow/python/lib/core/py_util.h"

 namespace tensorflow {

@@ -82,10 +84,8 @@ class StackTrace final {
     return *this;
   }

-  // Returns string representation of the captured stack trace.
-  std::string ToString() const;
-
-  // TODO(kkb): Implement structured stack trace object getter.
+  // Returns a structured representation of the captured stack trace.
+  std::vector<StackFrame> ToStackFrames() const;

  private:
   std::array<PyCodeObject*, kMaxDepth> code_objs_;

@@ -103,6 +103,53 @@ class StackTrace final {
   StackTrace& operator=(const StackTrace&) = delete;
 };

+// A class that manages Python stack traces in a circular buffer. Users can
+// insert stack trace entries and retrieve them by ids.
+class StackTraceManager {
+ public:
+  static constexpr int kStackTraceCircularBufferSize = 1024;
+
+  // Captures the current Python stack trace and returns an id.
+  // Python GIL must be acquired beforehand.
+  ABSL_MUST_USE_RESULT
+  ABSL_ATTRIBUTE_HOT
+  int Capture() {
+    DCheckPyGilState();
+    const int id = next_id_++;
+    const int index = id & (kStackTraceCircularBufferSize - 1);
+    stack_traces_[index] = StackTrace::Capture();
+    return id;
+  }
+
+  // Retrieves a captured Python stack trace by id. Returns `nullptr` if the
+  // requested stack trace has been evicted from the circular buffer.
+  // Python GIL must be acquired beforehand.
+  ABSL_MUST_USE_RESULT
+  StackTrace* Get(int id);
+
+ private:
+  int next_id_ = 0;
+  std::array<StackTrace, kStackTraceCircularBufferSize> stack_traces_;
+};
+
+// Singleton StackTraceManager.
+extern StackTraceManager* const stack_trace_manager;
+
+// Returns the current Python stack trace as an `AbstractStackTrace` object.
+// Note that the actual stack trace is kept in a circular buffer, so the
+// conversion to stack frames can fail if the trace has already been evicted.
+// Python GIL must be acquired beforehand.
+inline AbstractStackTrace GetStackTrace() {
+  DCheckPyGilState();
+  return AbstractStackTrace(stack_trace_manager->Capture(), [](int id) {
+    PyGILState_STATE gstate = PyGILState_Ensure();
+    std::vector<StackFrame> result =
+        stack_trace_manager->Get(id)->ToStackFrames();
+    PyGILState_Release(gstate);
+    return result;
+  });
+}
+
 }  // namespace tensorflow

 #endif  // TENSORFLOW_PYTHON_UTIL_STACK_TRACE_H_
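To tie the pieces together (sketch only, not part of the diff): the capture half of the mechanism lives in this header, while the consumption half lives in AsyncExecuteNode and pywrap_tfe_src.cc. A condensed, hypothetical caller, assuming the GIL is held, looks like this:

// Sketch only: `DispatchWithTrace` and its template parameter are hypothetical;
// the real call sites are TFE_Py_ExecuteCancelable and TFE_Py_FastPathExecute_C
// in pywrap_tfe_src.cc.
#include "tensorflow/python/util/stack_trace.h"

namespace example {

template <typename OpT>  // stands in for the unwrapped eager operation
void DispatchWithTrace(OpT* op) {
  // 1. At dispatch time, capture the Python stack as a cheap integer id
  //    wrapped in an AbstractStackTrace (GetStackTrace() above).
  op->SetStackTrace(tensorflow::GetStackTrace());

  // 2. If the kernel later fails asynchronously, AsyncExecuteNode rebuilds the
  //    error Status with op->GetStackTrace()->ToStackFrames(), so frames are
  //    symbolized only on the error path.
  // 3. When the error surfaces in Python, FormatErrorStatusStackTrace renders
  //    those frames (plus source lines via `linecache`) into the exception
  //    message.
}

}  // namespace example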