Implement async Python stack trace error reporting.
PiperOrigin-RevId: 322211018 Change-Id: I0d02e98748cc79cca04dc71fc828990548fce70f
This commit is contained in:
		
							parent
							
								
									6ee2d328fd
								
							
						
					
					
						commit
						fa85309fb5
					
				| @ -308,6 +308,7 @@ cc_library( | |||||||
|         "//tensorflow/core:framework", |         "//tensorflow/core:framework", | ||||||
|         "//tensorflow/core:lib", |         "//tensorflow/core:lib", | ||||||
|         "//tensorflow/core:protos_all_cc", |         "//tensorflow/core:protos_all_cc", | ||||||
|  |         "//tensorflow/core/util:abstract_stack_trace", | ||||||
|         "@com_google_absl//absl/types:span", |         "@com_google_absl//absl/types:span", | ||||||
|     ], |     ], | ||||||
| ) | ) | ||||||
|  | |||||||
| @ -26,6 +26,7 @@ limitations under the License. | |||||||
| #include "tensorflow/core/framework/types.pb.h" | #include "tensorflow/core/framework/types.pb.h" | ||||||
| #include "tensorflow/core/platform/casts.h" | #include "tensorflow/core/platform/casts.h" | ||||||
| #include "tensorflow/core/platform/status.h" | #include "tensorflow/core/platform/status.h" | ||||||
|  | #include "tensorflow/core/util/abstract_stack_trace.h" | ||||||
| 
 | 
 | ||||||
| struct TFE_Op; | struct TFE_Op; | ||||||
| 
 | 
 | ||||||
| @ -44,6 +45,12 @@ class ImmediateExecutionOperation : public AbstractOperation { | |||||||
|   // Experimental
 |   // Experimental
 | ||||||
|   virtual Status SetUseXla(bool enable) = 0; |   virtual Status SetUseXla(bool enable) = 0; | ||||||
| 
 | 
 | ||||||
|  |   // Set stack trace to be used for potential async error reporting.
 | ||||||
|  |   virtual void SetStackTrace(AbstractStackTrace stack_trace) = 0; | ||||||
|  | 
 | ||||||
|  |   // Returns the stack trace set by `SetStackTrace`, if one exists.
 | ||||||
|  |   virtual absl::optional<AbstractStackTrace> GetStackTrace() = 0; | ||||||
|  | 
 | ||||||
|   // For LLVM style RTTI.
 |   // For LLVM style RTTI.
 | ||||||
|   static bool classof(const AbstractOperation* ptr) { |   static bool classof(const AbstractOperation* ptr) { | ||||||
|     return ptr->getKind() == kEager || ptr->getKind() == kTfrt; |     return ptr->getKind() == kEager || ptr->getKind() == kTfrt; | ||||||
|  | |||||||
| @ -163,6 +163,7 @@ tf_cuda_library( | |||||||
|         "//tensorflow/core:protos_all_cc", |         "//tensorflow/core:protos_all_cc", | ||||||
|         "//tensorflow/core:lib", |         "//tensorflow/core:lib", | ||||||
|         "//tensorflow/core/platform:platform_port", |         "//tensorflow/core/platform:platform_port", | ||||||
|  |         "//tensorflow/core/util:abstract_stack_trace", | ||||||
|     ] + select({ |     ] + select({ | ||||||
|         "//tensorflow:android": [ |         "//tensorflow:android": [ | ||||||
|             "//tensorflow/core:portable_tensorflow_lib_lite", |             "//tensorflow/core:portable_tensorflow_lib_lite", | ||||||
|  | |||||||
| @ -306,6 +306,7 @@ Status EagerOperation::Reset( | |||||||
|   } |   } | ||||||
|   attrs_.Reset(op); |   attrs_.Reset(op); | ||||||
|   use_xla_ = false; |   use_xla_ = false; | ||||||
|  |   stack_trace_.reset(); | ||||||
|   is_function_ = is_function; |   is_function_ = is_function; | ||||||
|   cancellation_manager_ = nullptr; |   cancellation_manager_ = nullptr; | ||||||
|   executor_ = executor ? executor : &ctx_.Executor(); |   executor_ = executor ? executor : &ctx_.Executor(); | ||||||
|  | |||||||
| @ -29,6 +29,7 @@ limitations under the License. | |||||||
| #include "tensorflow/core/framework/cancellation.h" | #include "tensorflow/core/framework/cancellation.h" | ||||||
| #include "tensorflow/core/framework/device_attributes.pb.h" | #include "tensorflow/core/framework/device_attributes.pb.h" | ||||||
| #include "tensorflow/core/framework/op_def.pb.h" | #include "tensorflow/core/framework/op_def.pb.h" | ||||||
|  | #include "tensorflow/core/util/abstract_stack_trace.h" | ||||||
| #include "tensorflow/core/util/device_name_utils.h" | #include "tensorflow/core/util/device_name_utils.h" | ||||||
| 
 | 
 | ||||||
| namespace tensorflow { | namespace tensorflow { | ||||||
| @ -120,6 +121,14 @@ class EagerOperation : public ImmediateExecutionOperation { | |||||||
| 
 | 
 | ||||||
|   Status SetUseXla(bool enable) override; |   Status SetUseXla(bool enable) override; | ||||||
| 
 | 
 | ||||||
|  |   void SetStackTrace(AbstractStackTrace stack_trace) override { | ||||||
|  |     stack_trace_ = stack_trace; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   absl::optional<AbstractStackTrace> GetStackTrace() override { | ||||||
|  |     return stack_trace_; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|   Status Reset(const char* op, const char* device_name, bool remote, |   Status Reset(const char* op, const char* device_name, bool remote, | ||||||
|                EagerExecutor* executor, |                EagerExecutor* executor, | ||||||
|                const absl::optional<EagerRemoteFunctionParams> |                const absl::optional<EagerRemoteFunctionParams> | ||||||
| @ -218,6 +227,7 @@ class EagerOperation : public ImmediateExecutionOperation { | |||||||
|   VariantDevice device_; |   VariantDevice device_; | ||||||
| 
 | 
 | ||||||
|   bool use_xla_ = false; |   bool use_xla_ = false; | ||||||
|  |   absl::optional<AbstractStackTrace> stack_trace_; | ||||||
|   bool is_function_;  // Conceptually const, but can't be because of Reset
 |   bool is_function_;  // Conceptually const, but can't be because of Reset
 | ||||||
|   bool colocation_exempt_; |   bool colocation_exempt_; | ||||||
|   CancellationManager* cancellation_manager_ = nullptr;  // Not owned.
 |   CancellationManager* cancellation_manager_ = nullptr;  // Not owned.
 | ||||||
|  | |||||||
| @ -634,7 +634,7 @@ Status EagerLocalExecute(EagerOperation* op, TensorHandle** retvals, | |||||||
|     auto node = absl::make_unique<AsyncExecuteNode>( |     auto node = absl::make_unique<AsyncExecuteNode>( | ||||||
|         &ctx, op->Inputs(), op->remote_func_params(), std::move(kernel), |         &ctx, op->Inputs(), op->remote_func_params(), std::move(kernel), | ||||||
|         graph_collector, op->GetCancellationManager(), |         graph_collector, op->GetCancellationManager(), | ||||||
|         absl::Span<TensorHandle*>(retvals, num_outputs)); |         absl::Span<TensorHandle*>(retvals, num_outputs), op->GetStackTrace()); | ||||||
|     // Release the inputs from the eager operation since the AsyncExecuteNode
 |     // Release the inputs from the eager operation since the AsyncExecuteNode
 | ||||||
|     // would have taken ownership. This allows the inputs to be forwarded if
 |     // would have taken ownership. This allows the inputs to be forwarded if
 | ||||||
|     // possible.
 |     // possible.
 | ||||||
|  | |||||||
| @ -150,14 +150,16 @@ class AsyncExecuteNode : public EagerNode { | |||||||
|       core::RefCountPtr<KernelAndDevice> kernel, |       core::RefCountPtr<KernelAndDevice> kernel, | ||||||
|       GraphCollector* graph_collector, |       GraphCollector* graph_collector, | ||||||
|       CancellationManager* cancellation_manager, |       CancellationManager* cancellation_manager, | ||||||
|       absl::Span<TensorHandle*> retvals) |       absl::Span<TensorHandle*> retvals, | ||||||
|  |       absl::optional<AbstractStackTrace> stack_trace) | ||||||
|       : EagerNode(), |       : EagerNode(), | ||||||
|         ctx_(ctx), |         ctx_(ctx), | ||||||
|         inputs_(inputs), |         inputs_(inputs), | ||||||
|         remote_func_params_(remote_func_params), |         remote_func_params_(remote_func_params), | ||||||
|         kernel_(std::move(kernel)), |         kernel_(std::move(kernel)), | ||||||
|         graph_collector_(graph_collector), |         graph_collector_(graph_collector), | ||||||
|         cancellation_manager_(cancellation_manager) { |         cancellation_manager_(cancellation_manager), | ||||||
|  |         stack_trace_(stack_trace) { | ||||||
|     // Copy the output handles, since the container for them might get
 |     // Copy the output handles, since the container for them might get
 | ||||||
|     // destroyed.
 |     // destroyed.
 | ||||||
|     for (auto handle : retvals) { |     for (auto handle : retvals) { | ||||||
| @ -194,10 +196,14 @@ class AsyncExecuteNode : public EagerNode { | |||||||
|       } |       } | ||||||
|       ++i; |       ++i; | ||||||
|     } |     } | ||||||
|     const Status status = EagerKernelExecute( |     Status status = EagerKernelExecute( | ||||||
|         ctx_, inputs_, remote_func_params_, kernel_, graph_collector_, |         ctx_, inputs_, remote_func_params_, kernel_, graph_collector_, | ||||||
|         cancellation_manager_, absl::MakeSpan(retvals_)); |         cancellation_manager_, absl::MakeSpan(retvals_)); | ||||||
|     if (!status.ok()) { |     if (!status.ok()) { | ||||||
|  |       if (stack_trace_.has_value()) { | ||||||
|  |         status = Status(status.code(), status.error_message(), | ||||||
|  |                         stack_trace_->ToStackFrames()); | ||||||
|  |       } | ||||||
|       Abort(status); |       Abort(status); | ||||||
|       return status; |       return status; | ||||||
|     } |     } | ||||||
| @ -227,6 +233,7 @@ class AsyncExecuteNode : public EagerNode { | |||||||
|   core::RefCountPtr<KernelAndDevice> kernel_; |   core::RefCountPtr<KernelAndDevice> kernel_; | ||||||
|   GraphCollector* graph_collector_; |   GraphCollector* graph_collector_; | ||||||
|   CancellationManager* const cancellation_manager_; |   CancellationManager* const cancellation_manager_; | ||||||
|  |   absl::optional<AbstractStackTrace> stack_trace_; | ||||||
|   absl::InlinedVector<TensorHandle*, 2> retvals_; |   absl::InlinedVector<TensorHandle*, 2> retvals_; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -62,9 +62,11 @@ inline const strings::AlphaNum& PrepareForStrCat(const strings::AlphaNum& a) { | |||||||
| // to be several layers of additional context.
 | // to be several layers of additional context.
 | ||||||
| template <typename... Args> | template <typename... Args> | ||||||
| void AppendToMessage(::tensorflow::Status* status, Args... args) { | void AppendToMessage(::tensorflow::Status* status, Args... args) { | ||||||
|  |   std::vector<StackFrame> stack_trace = status->stack_trace(); | ||||||
|   *status = ::tensorflow::Status( |   *status = ::tensorflow::Status( | ||||||
|       status->code(), |       status->code(), | ||||||
|       ::tensorflow::strings::StrCat(status->error_message(), "\n\t", args...)); |       ::tensorflow::strings::StrCat(status->error_message(), "\n\t", args...), | ||||||
|  |       std::move(stack_trace)); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| // For propagating errors when calling a function.
 | // For propagating errors when calling a function.
 | ||||||
|  | |||||||
| @ -89,11 +89,13 @@ class StatusLogSink : public TFLogSink { | |||||||
| 
 | 
 | ||||||
| }  // namespace
 | }  // namespace
 | ||||||
| 
 | 
 | ||||||
| Status::Status(tensorflow::error::Code code, StringPiece msg) { | Status::Status(tensorflow::error::Code code, tensorflow::StringPiece msg, | ||||||
|  |                std::vector<StackFrame>&& stack_trace) { | ||||||
|   assert(code != tensorflow::error::OK); |   assert(code != tensorflow::error::OK); | ||||||
|   state_ = std::unique_ptr<State>(new State); |   state_ = std::unique_ptr<State>(new State); | ||||||
|   state_->code = code; |   state_->code = code; | ||||||
|   state_->msg = string(msg); |   state_->msg = string(msg); | ||||||
|  |   state_->stack_trace = std::move(stack_trace); | ||||||
|   VLOG(5) << "Generated non-OK status: \"" << *this << "\". " |   VLOG(5) << "Generated non-OK status: \"" << *this << "\". " | ||||||
|           << CurrentStackTrace(); |           << CurrentStackTrace(); | ||||||
| } | } | ||||||
| @ -117,6 +119,11 @@ const string& Status::empty_string() { | |||||||
|   return *empty; |   return *empty; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | const std::vector<StackFrame>& Status::empty_stack_trace() { | ||||||
|  |   static std::vector<StackFrame>* empty = new std::vector<StackFrame>(); | ||||||
|  |   return *empty; | ||||||
|  | } | ||||||
|  | 
 | ||||||
| string error_name(error::Code code) { | string error_name(error::Code code) { | ||||||
|   switch (code) { |   switch (code) { | ||||||
|     case tensorflow::error::OK: |     case tensorflow::error::OK: | ||||||
|  | |||||||
| @ -29,6 +29,13 @@ limitations under the License. | |||||||
| 
 | 
 | ||||||
| namespace tensorflow { | namespace tensorflow { | ||||||
| 
 | 
 | ||||||
// A struct representing a frame in a stack trace: the source file, the line
// number within that file, and the name of the enclosing function.
struct StackFrame {
  std::string file_name;
  // Defaults to 0 so a default-constructed frame never carries an
  // indeterminate line number.
  int line_number = 0;
  std::string function_name;
};
|  | 
 | ||||||
| #if defined(__clang__) | #if defined(__clang__) | ||||||
| // Only clang supports warn_unused_result as a type annotation.
 | // Only clang supports warn_unused_result as a type annotation.
 | ||||||
| class TF_MUST_USE_RESULT Status; | class TF_MUST_USE_RESULT Status; | ||||||
| @ -43,7 +50,15 @@ class Status { | |||||||
| 
 | 
 | ||||||
|   /// \brief Create a status with the specified error code and msg as a
 |   /// \brief Create a status with the specified error code and msg as a
 | ||||||
|   /// human-readable string containing more detailed information.
 |   /// human-readable string containing more detailed information.
 | ||||||
|   Status(tensorflow::error::Code code, tensorflow::StringPiece msg); |   Status(tensorflow::error::Code code, tensorflow::StringPiece msg) | ||||||
|  |       : Status(code, msg, {}) {} | ||||||
|  | 
 | ||||||
|  |   /// \brief Create a status with the specified error code, msg as a
 | ||||||
|  |   /// human-readable string with more detailed information, and stack trace.
 | ||||||
|  | #ifndef SWIG | ||||||
|  |   Status(tensorflow::error::Code code, tensorflow::StringPiece msg, | ||||||
|  |          std::vector<StackFrame>&& stack_trace); | ||||||
|  | #endif | ||||||
| 
 | 
 | ||||||
|   /// Copy the specified status.
 |   /// Copy the specified status.
 | ||||||
|   Status(const Status& s); |   Status(const Status& s); | ||||||
| @ -66,6 +81,10 @@ class Status { | |||||||
|     return ok() ? empty_string() : state_->msg; |     return ok() ? empty_string() : state_->msg; | ||||||
|   } |   } | ||||||
| 
 | 
 | ||||||
|  |   const std::vector<StackFrame>& stack_trace() const { | ||||||
|  |     return ok() ? empty_stack_trace() : state_->stack_trace; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|   bool operator==(const Status& x) const; |   bool operator==(const Status& x) const; | ||||||
|   bool operator!=(const Status& x) const; |   bool operator!=(const Status& x) const; | ||||||
| 
 | 
 | ||||||
| @ -91,9 +110,11 @@ class Status { | |||||||
| 
 | 
 | ||||||
|  private: |  private: | ||||||
|   static const std::string& empty_string(); |   static const std::string& empty_string(); | ||||||
|  |   static const std::vector<StackFrame>& empty_stack_trace(); | ||||||
|   struct State { |   struct State { | ||||||
|     tensorflow::error::Code code; |     tensorflow::error::Code code; | ||||||
|     std::string msg; |     std::string msg; | ||||||
|  |     std::vector<StackFrame> stack_trace; | ||||||
|   }; |   }; | ||||||
|   // OK status has a `NULL` state_.  Otherwise, `state_` points to
 |   // OK status has a `NULL` state_.  Otherwise, `state_` points to
 | ||||||
|   // a `State` structure containing the error code and message(s)
 |   // a `State` structure containing the error code and message(s)
 | ||||||
|  | |||||||
| @ -61,6 +61,7 @@ filegroup( | |||||||
| filegroup( | filegroup( | ||||||
|     name = "mobile_srcs_only_runtime", |     name = "mobile_srcs_only_runtime", | ||||||
|     srcs = [ |     srcs = [ | ||||||
|  |         "abstract_stack_trace.h", | ||||||
|         "batch_util.cc", |         "batch_util.cc", | ||||||
|         "batch_util.h", |         "batch_util.h", | ||||||
|         "bcast.cc", |         "bcast.cc", | ||||||
| @ -313,6 +314,7 @@ filegroup( | |||||||
| filegroup( | filegroup( | ||||||
|     name = "framework_srcs", |     name = "framework_srcs", | ||||||
|     srcs = [ |     srcs = [ | ||||||
|  |         "abstract_stack_trace.h", | ||||||
|         "activation_mode.h", |         "activation_mode.h", | ||||||
|         "batch_util.h", |         "batch_util.h", | ||||||
|         "bcast.h", |         "bcast.h", | ||||||
| @ -437,6 +439,22 @@ cc_library( | |||||||
|     alwayslink = 1, |     alwayslink = 1, | ||||||
| ) | ) | ||||||
| 
 | 
 | ||||||
|  | cc_library( | ||||||
|  |     name = "abstract_stack_trace", | ||||||
|  |     hdrs = ["abstract_stack_trace.h"], | ||||||
|  |     visibility = [ | ||||||
|  |         "//tensorflow/c/eager:__pkg__", | ||||||
|  |         "//tensorflow/core:__pkg__", | ||||||
|  |         "//tensorflow/core/common_runtime/eager:__pkg__", | ||||||
|  |         "//tensorflow/core/platform:__pkg__", | ||||||
|  |         "//tensorflow/python:__pkg__", | ||||||
|  |         "//tensorflow/python/eager:__pkg__", | ||||||
|  |     ], | ||||||
|  |     deps = [ | ||||||
|  |         "//tensorflow/core/platform:status", | ||||||
|  |     ], | ||||||
|  | ) | ||||||
|  | 
 | ||||||
| tf_cuda_library( | tf_cuda_library( | ||||||
|     name = "gpu_cuda_alias", |     name = "gpu_cuda_alias", | ||||||
|     hdrs = ["gpu_cuda_alias.h"], |     hdrs = ["gpu_cuda_alias.h"], | ||||||
|  | |||||||
							
								
								
									
										44
									
								
								tensorflow/core/util/abstract_stack_trace.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										44
									
								
								tensorflow/core/util/abstract_stack_trace.h
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,44 @@ | |||||||
|  | /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
 | ||||||
|  | 
 | ||||||
|  | Licensed under the Apache License, Version 2.0 (the "License"); | ||||||
|  | you may not use this file except in compliance with the License. | ||||||
|  | You may obtain a copy of the License at | ||||||
|  | 
 | ||||||
|  |     http://www.apache.org/licenses/LICENSE-2.0
 | ||||||
|  | 
 | ||||||
|  | Unless required by applicable law or agreed to in writing, software | ||||||
|  | distributed under the License is distributed on an "AS IS" BASIS, | ||||||
|  | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||
|  | See the License for the specific language governing permissions and | ||||||
|  | limitations under the License. | ||||||
|  | ==============================================================================*/ | ||||||
|  | 
 | ||||||
|  | #ifndef TENSORFLOW_CORE_UTIL_ABSTRACT_STACK_TRACE_H_ | ||||||
|  | #define TENSORFLOW_CORE_UTIL_ABSTRACT_STACK_TRACE_H_ | ||||||
|  | 
 | ||||||
|  | #include <string> | ||||||
|  | 
 | ||||||
|  | #include "tensorflow/core/platform/status.h" | ||||||
|  | 
 | ||||||
|  | namespace tensorflow { | ||||||
|  | 
 | ||||||
|  | // Language agnostic stack trace class. It only saves an id, and language
 | ||||||
|  | // clients are responsible for managing the actual stack trace objects.
 | ||||||
|  | class AbstractStackTrace { | ||||||
|  |  public: | ||||||
|  |   AbstractStackTrace(int id, std::vector<StackFrame> (*to_stack_frames)(int)) | ||||||
|  |       : id_(id), to_stack_frames_(to_stack_frames) {} | ||||||
|  | 
 | ||||||
|  |   // Returns stack trace as a vector of `StackFrame`s.
 | ||||||
|  |   std::vector<StackFrame> ToStackFrames() const { | ||||||
|  |     return to_stack_frames_(id_); | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |  private: | ||||||
|  |   int id_; | ||||||
|  |   std::vector<StackFrame> (*to_stack_frames_)(int); | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | }  // namespace tensorflow
 | ||||||
|  | 
 | ||||||
|  | #endif  // TENSORFLOW_CORE_UTIL_ABSTRACT_STACK_TRACE_H_
 | ||||||
| @ -5664,9 +5664,11 @@ cc_library( | |||||||
|     hdrs = ["util/stack_trace.h"], |     hdrs = ["util/stack_trace.h"], | ||||||
|     deps = [ |     deps = [ | ||||||
|         ":py_util", |         ":py_util", | ||||||
|  |         "//tensorflow/core/platform:str_util", | ||||||
|  |         "//tensorflow/core/platform:stringpiece", | ||||||
|  |         "//tensorflow/core/util:abstract_stack_trace", | ||||||
|         "//third_party/python_runtime:headers",  # buildcleaner: keep |         "//third_party/python_runtime:headers",  # buildcleaner: keep | ||||||
|         "@com_google_absl//absl/base:core_headers", |         "@com_google_absl//absl/base:core_headers", | ||||||
|         "@com_google_absl//absl/synchronization", |  | ||||||
|         "@com_google_absl//absl/types:optional", |         "@com_google_absl//absl/types:optional", | ||||||
|     ], |     ], | ||||||
| ) | ) | ||||||
|  | |||||||
| @ -56,13 +56,16 @@ cc_library( | |||||||
|         "//tensorflow/core/platform:logging", |         "//tensorflow/core/platform:logging", | ||||||
|         "//tensorflow/core/platform:types", |         "//tensorflow/core/platform:types", | ||||||
|         "//tensorflow/core/profiler/lib:traceme", |         "//tensorflow/core/profiler/lib:traceme", | ||||||
|  |         "//tensorflow/core/util:abstract_stack_trace", | ||||||
|         "//tensorflow/python:cpp_python_util", |         "//tensorflow/python:cpp_python_util", | ||||||
|         "//tensorflow/python:ndarray_tensor", |         "//tensorflow/python:ndarray_tensor", | ||||||
|         "//tensorflow/python:ndarray_tensor_bridge", |         "//tensorflow/python:ndarray_tensor_bridge", | ||||||
|         "//tensorflow/python:numpy_lib", |         "//tensorflow/python:numpy_lib", | ||||||
|         "//tensorflow/python:py_exception_registry", |         "//tensorflow/python:py_exception_registry", | ||||||
|         "//tensorflow/python:py_seq_tensor", |         "//tensorflow/python:py_seq_tensor", | ||||||
|  |         "//tensorflow/python:py_util", | ||||||
|         "//tensorflow/python:safe_ptr", |         "//tensorflow/python:safe_ptr", | ||||||
|  |         "//tensorflow/python:stack_trace", | ||||||
|         "//third_party/py/numpy:headers", |         "//third_party/py/numpy:headers", | ||||||
|         "//third_party/python_runtime:headers", |         "//third_party/python_runtime:headers", | ||||||
|         "@com_google_absl//absl/container:flat_hash_map", |         "@com_google_absl//absl/container:flat_hash_map", | ||||||
|  | |||||||
| @ -30,6 +30,7 @@ from tensorflow.python.eager import test | |||||||
| from tensorflow.python.framework import config | from tensorflow.python.framework import config | ||||||
| from tensorflow.python.framework import constant_op | from tensorflow.python.framework import constant_op | ||||||
| from tensorflow.python.framework import dtypes | from tensorflow.python.framework import dtypes | ||||||
|  | from tensorflow.python.framework import errors_impl | ||||||
| from tensorflow.python.framework import ops | from tensorflow.python.framework import ops | ||||||
| from tensorflow.python.framework import tensor_shape | from tensorflow.python.framework import tensor_shape | ||||||
| from tensorflow.python.framework import test_util | from tensorflow.python.framework import test_util | ||||||
| @ -480,6 +481,24 @@ class OpsTest(test_util.TensorFlowTestCase, parameterized.TestCase): | |||||||
|     self.assertIs(weak_x(), None) |     self.assertIs(weak_x(), None) | ||||||
|     self.assertIs(weak_y(), None) |     self.assertIs(weak_y(), None) | ||||||
| 
 | 
 | ||||||
def testAsyncExceptionStackTrace(self):
  """Checks that an async op error names the Python frame that issued the op."""
  config.set_synchronous_execution(False)
  try:

    def exception_originated_from_here():
      # Invalid shapes for matmul.
      return math_ops.matmul([[1]], [[2], [3]])

    # In sync mode, an exception would have been raised here but since this
    # is in async, the exception will be raised next.
    x = exception_originated_from_here()

    with self.assertRaisesRegex(errors_impl.InvalidArgumentError,
                                'in exception_originated_from_here'):
      x.numpy()
  finally:
    # Run the cleanup even when the assertion above fails, so async mode and
    # the pending async error cannot leak into the tests that follow.
    context.async_clear_error()
    config.set_synchronous_execution(True)
|  | 
 | ||||||
| 
 | 
 | ||||||
| if __name__ == '__main__': | if __name__ == '__main__': | ||||||
|   test.main() |   test.main() | ||||||
|  | |||||||
| @ -41,10 +41,13 @@ limitations under the License. | |||||||
| #include "tensorflow/core/platform/protobuf.h" | #include "tensorflow/core/platform/protobuf.h" | ||||||
| #include "tensorflow/core/platform/types.h" | #include "tensorflow/core/platform/types.h" | ||||||
| #include "tensorflow/core/profiler/lib/traceme.h" | #include "tensorflow/core/profiler/lib/traceme.h" | ||||||
|  | #include "tensorflow/core/util/abstract_stack_trace.h" | ||||||
| #include "tensorflow/python/eager/pywrap_gradient_exclusions.h" | #include "tensorflow/python/eager/pywrap_gradient_exclusions.h" | ||||||
| #include "tensorflow/python/eager/pywrap_tensor.h" | #include "tensorflow/python/eager/pywrap_tensor.h" | ||||||
| #include "tensorflow/python/eager/pywrap_tfe.h" | #include "tensorflow/python/eager/pywrap_tfe.h" | ||||||
|  | #include "tensorflow/python/lib/core/py_util.h" | ||||||
| #include "tensorflow/python/lib/core/safe_ptr.h" | #include "tensorflow/python/lib/core/safe_ptr.h" | ||||||
|  | #include "tensorflow/python/util/stack_trace.h" | ||||||
| #include "tensorflow/python/util/util.h" | #include "tensorflow/python/util/util.h" | ||||||
| 
 | 
 | ||||||
| using tensorflow::string; | using tensorflow::string; | ||||||
| @ -854,10 +857,14 @@ void TFE_Py_ExecuteCancelable(TFE_Context* ctx, const char* device_name, | |||||||
|                               TF_Status* out_status) { |                               TF_Status* out_status) { | ||||||
|   tensorflow::profiler::TraceMe activity( |   tensorflow::profiler::TraceMe activity( | ||||||
|       "TFE_Py_ExecuteCancelable", tensorflow::profiler::TraceMeLevel::kInfo); |       "TFE_Py_ExecuteCancelable", tensorflow::profiler::TraceMeLevel::kInfo); | ||||||
|  | 
 | ||||||
|   TFE_Op* op = GetOp(ctx, op_name, device_name, out_status); |   TFE_Op* op = GetOp(ctx, op_name, device_name, out_status); | ||||||
|  | 
 | ||||||
|   auto cleaner = tensorflow::gtl::MakeCleanup([ctx, op] { ReturnOp(ctx, op); }); |   auto cleaner = tensorflow::gtl::MakeCleanup([ctx, op] { ReturnOp(ctx, op); }); | ||||||
|   if (!out_status->status.ok()) return; |   if (!out_status->status.ok()) return; | ||||||
| 
 | 
 | ||||||
|  |   tensorflow::unwrap(op)->SetStackTrace(tensorflow::GetStackTrace()); | ||||||
|  | 
 | ||||||
|   for (int i = 0; i < inputs->size() && out_status->status.ok(); ++i) { |   for (int i = 0; i < inputs->size() && out_status->status.ok(); ++i) { | ||||||
|     TFE_OpAddInput(op, inputs->at(i), out_status); |     TFE_OpAddInput(op, inputs->at(i), out_status); | ||||||
|   } |   } | ||||||
| @ -970,14 +977,54 @@ void RaiseFallbackException(const char* message) { | |||||||
|           .data()); |           .data()); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | // Format and return `status`' error message with the attached stack trace if
 | ||||||
|  | // available. `status` must have an error.
 | ||||||
|  | std::string FormatErrorStatusStackTrace(const tensorflow::Status& status) { | ||||||
|  |   tensorflow::DCheckPyGilState(); | ||||||
|  |   DCHECK(!status.ok()); | ||||||
|  | 
 | ||||||
|  |   if (status.stack_trace().empty()) return status.error_message(); | ||||||
|  | 
 | ||||||
|  |   const std::vector<tensorflow::StackFrame>& stack_trace = status.stack_trace(); | ||||||
|  | 
 | ||||||
|  |   PyObject* linecache = PyImport_ImportModule("linecache"); | ||||||
|  |   PyObject* getline = | ||||||
|  |       PyObject_GetAttr(linecache, PyUnicode_FromString("getline")); | ||||||
|  |   DCHECK(getline); | ||||||
|  | 
 | ||||||
|  |   std::ostringstream result; | ||||||
|  |   result << "Exception originated from\n\n"; | ||||||
|  | 
 | ||||||
|  |   for (const tensorflow::StackFrame& stack_frame : stack_trace) { | ||||||
|  |     PyObject* line_str_obj = PyObject_CallFunction( | ||||||
|  |         getline, const_cast<char*>("si"), stack_frame.file_name.c_str(), | ||||||
|  |         stack_frame.line_number); | ||||||
|  |     tensorflow::StringPiece line_str = TFE_GetPythonString(line_str_obj); | ||||||
|  |     tensorflow::str_util::RemoveWhitespaceContext(&line_str); | ||||||
|  |     result << "  File \"" << stack_frame.file_name << "\", line " | ||||||
|  |            << stack_frame.line_number << ", in " << stack_frame.function_name | ||||||
|  |            << '\n'; | ||||||
|  | 
 | ||||||
|  |     if (!line_str.empty()) result << "    " << line_str << '\n'; | ||||||
|  |     Py_XDECREF(line_str_obj); | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   Py_DecRef(getline); | ||||||
|  |   Py_DecRef(linecache); | ||||||
|  | 
 | ||||||
|  |   result << '\n' << status.error_message(); | ||||||
|  |   return result.str(); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| int MaybeRaiseExceptionFromTFStatus(TF_Status* status, PyObject* exception) { | int MaybeRaiseExceptionFromTFStatus(TF_Status* status, PyObject* exception) { | ||||||
|   if (status->status.ok()) return 0; |   if (status->status.ok()) return 0; | ||||||
|   const char* msg = TF_Message(status); |   const char* msg = TF_Message(status); | ||||||
|   if (exception == nullptr) { |   if (exception == nullptr) { | ||||||
|     tensorflow::mutex_lock l(exception_class_mutex); |     tensorflow::mutex_lock l(exception_class_mutex); | ||||||
|     if (exception_class != nullptr) { |     if (exception_class != nullptr) { | ||||||
|       tensorflow::Safe_PyObjectPtr val( |       tensorflow::Safe_PyObjectPtr val(Py_BuildValue( | ||||||
|           Py_BuildValue("si", msg, TF_GetCode(status))); |           "si", FormatErrorStatusStackTrace(status->status).c_str(), | ||||||
|  |           TF_GetCode(status))); | ||||||
|       if (PyErr_Occurred()) { |       if (PyErr_Occurred()) { | ||||||
|         // NOTE: This hides the actual error (i.e. the reason `status` was not
 |         // NOTE: This hides the actual error (i.e. the reason `status` was not
 | ||||||
|         // TF_OK), but there is nothing we can do at this point since we can't
 |         // TF_OK), but there is nothing we can do at this point since we can't
 | ||||||
| @ -1003,7 +1050,8 @@ int MaybeRaiseExceptionFromStatus(const tensorflow::Status& status, | |||||||
|   if (exception == nullptr) { |   if (exception == nullptr) { | ||||||
|     tensorflow::mutex_lock l(exception_class_mutex); |     tensorflow::mutex_lock l(exception_class_mutex); | ||||||
|     if (exception_class != nullptr) { |     if (exception_class != nullptr) { | ||||||
|       tensorflow::Safe_PyObjectPtr val(Py_BuildValue("si", msg, status.code())); |       tensorflow::Safe_PyObjectPtr val(Py_BuildValue( | ||||||
|  |           "si", FormatErrorStatusStackTrace(status).c_str(), status.code())); | ||||||
|       PyErr_SetObject(exception_class, val.get()); |       PyErr_SetObject(exception_class, val.get()); | ||||||
|       return -1; |       return -1; | ||||||
|     } else { |     } else { | ||||||
| @ -3527,6 +3575,8 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject* args) { | |||||||
|   } |   } | ||||||
| 
 | 
 | ||||||
|   TFE_Op* op = GetOp(ctx, op_name, op_exec_info.device_name, status); |   TFE_Op* op = GetOp(ctx, op_name, op_exec_info.device_name, status); | ||||||
|  |   tensorflow::unwrap(op)->SetStackTrace(tensorflow::GetStackTrace()); | ||||||
|  | 
 | ||||||
|   auto cleaner = tensorflow::gtl::MakeCleanup([status, ctx, op] { |   auto cleaner = tensorflow::gtl::MakeCleanup([status, ctx, op] { | ||||||
|     ReturnStatus(status); |     ReturnStatus(status); | ||||||
|     ReturnOp(ctx, op); |     ReturnOp(ctx, op); | ||||||
| @ -3746,11 +3796,14 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject* args) { | |||||||
|   if (!status->status.ok()) { |   if (!status->status.ok()) { | ||||||
|     // Augment the status with the op_name for easier debugging similar to
 |     // Augment the status with the op_name for easier debugging similar to
 | ||||||
|     // TFE_Py_Execute.
 |     // TFE_Py_Execute.
 | ||||||
|     TF_SetStatus(status, TF_GetCode(status), |     std::vector<tensorflow::StackFrame> stack_trace = | ||||||
|                  tensorflow::strings::StrCat( |         status->status.stack_trace(); | ||||||
|                      TF_Message(status), |     status->status = tensorflow::Status( | ||||||
|                      " [Op:", TFE_GetPythonString(op_exec_info.op_name), "]") |         status->status.code(), | ||||||
|                      .c_str()); |         tensorflow::strings::StrCat( | ||||||
|  |             TF_Message(status), | ||||||
|  |             " [Op:", TFE_GetPythonString(op_exec_info.op_name), "]"), | ||||||
|  |         std::move(stack_trace)); | ||||||
| 
 | 
 | ||||||
|     MaybeRaiseExceptionFromTFStatus(status, nullptr); |     MaybeRaiseExceptionFromTFStatus(status, nullptr); | ||||||
|     return nullptr; |     return nullptr; | ||||||
|  | |||||||
| @ -15,6 +15,9 @@ limitations under the License. | |||||||
| 
 | 
 | ||||||
| #include "tensorflow/python/util/stack_trace.h" | #include "tensorflow/python/util/stack_trace.h" | ||||||
| 
 | 
 | ||||||
|  | #include "tensorflow/core/platform/str_util.h" | ||||||
|  | #include "tensorflow/core/platform/stringpiece.h" | ||||||
|  | 
 | ||||||
| namespace { | namespace { | ||||||
| 
 | 
 | ||||||
| // Returns C string from a Python string object. Handles Python2/3 strings.
 | // Returns C string from a Python string object. Handles Python2/3 strings.
 | ||||||
| @ -31,22 +34,33 @@ const char* GetPythonString(PyObject* o) { | |||||||
|   return PyBytes_AsString(o); |   return PyBytes_AsString(o); | ||||||
| #endif | #endif | ||||||
| } | } | ||||||
|  | 
 | ||||||
| }  // namespace
 | }  // namespace
 | ||||||
| 
 | 
 | ||||||
| namespace tensorflow { | namespace tensorflow { | ||||||
| std::string StackTrace::ToString() const { |  | ||||||
|   DCheckPyGilState(); |  | ||||||
| 
 | 
 | ||||||
|   std::ostringstream result; | std::vector<StackFrame> StackTrace::ToStackFrames() const { | ||||||
|  |   std::vector<StackFrame> result; | ||||||
|  |   result.reserve(size_); | ||||||
|  | 
 | ||||||
|   for (int i = size_ - 1; i >= 0; --i) { |   for (int i = size_ - 1; i >= 0; --i) { | ||||||
|     result << "  File \"" << PyUnicode_AsUTF8(code_objs_[i]->co_filename) |     const char* file_name = GetPythonString(code_objs_[i]->co_filename); | ||||||
|            << "\", line " |     const int line_number = | ||||||
|            << PyCode_Addr2Line(code_objs_[i], last_instructions_[i]) << ", in " |         PyCode_Addr2Line(code_objs_[i], last_instructions_[i]); | ||||||
|            << GetPythonString(code_objs_[i]->co_name) |     result.emplace_back(StackFrame{file_name, line_number, | ||||||
|            << "\n    <source line unimplemented>\n"; |                                    GetPythonString(code_objs_[i]->co_name)}); | ||||||
|     // TODO(kkb): Add source code line.  See tf_stack.cc's
 |  | ||||||
|     // FrameSummary::line() function.
 |  | ||||||
|   } |   } | ||||||
|   return result.str(); | 
 | ||||||
|  |   return result; | ||||||
| } | } | ||||||
|  | 
 | ||||||
|  | StackTrace* StackTraceManager::Get(int id) { | ||||||
|  |   DCheckPyGilState(); | ||||||
|  |   if (next_id_ - id > kStackTraceCircularBufferSize) return nullptr; | ||||||
|  | 
 | ||||||
|  |   return &stack_traces_[id & (kStackTraceCircularBufferSize - 1)]; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | StackTraceManager* const stack_trace_manager = new StackTraceManager(); | ||||||
|  | 
 | ||||||
| }  // namespace tensorflow
 | }  // namespace tensorflow
 | ||||||
|  | |||||||
| @ -25,6 +25,8 @@ limitations under the License. | |||||||
| 
 | 
 | ||||||
| #include "absl/base/attributes.h" | #include "absl/base/attributes.h" | ||||||
| #include "absl/base/optimization.h" | #include "absl/base/optimization.h" | ||||||
|  | #include "absl/types/optional.h" | ||||||
|  | #include "tensorflow/core/util/abstract_stack_trace.h" | ||||||
| #include "tensorflow/python/lib/core/py_util.h" | #include "tensorflow/python/lib/core/py_util.h" | ||||||
| 
 | 
 | ||||||
| namespace tensorflow { | namespace tensorflow { | ||||||
| @ -82,10 +84,8 @@ class StackTrace final { | |||||||
|     return *this; |     return *this; | ||||||
|   } |   } | ||||||
| 
 | 
 | ||||||
|   // Returns string representation of the captured stack trace.
 |   // Returns a structured representation of the captured stack trace.
 | ||||||
|   std::string ToString() const; |   std::vector<StackFrame> ToStackFrames() const; | ||||||
| 
 |  | ||||||
|   // TODO(kkb): Implement structured stack trace object getter.
 |  | ||||||
| 
 | 
 | ||||||
|  private: |  private: | ||||||
|   std::array<PyCodeObject*, kMaxDepth> code_objs_; |   std::array<PyCodeObject*, kMaxDepth> code_objs_; | ||||||
| @ -103,6 +103,53 @@ class StackTrace final { | |||||||
|   StackTrace& operator=(const StackTrace&) = delete; |   StackTrace& operator=(const StackTrace&) = delete; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  | // A class that manages Python stack traces in a circular buffer. Users can
 | ||||||
|  | // insert stack trace entries and retrieve them by id.
 | ||||||
|  | class StackTraceManager { | ||||||
|  |  public: | ||||||
|  |   static constexpr int kStackTraceCircularBufferSize = 1024; | ||||||
|  | 
 | ||||||
|  |   // Captures the current Python stack trace and returns an id.
 | ||||||
|  |   // Python GIL must be acquired beforehand.
 | ||||||
|  |   ABSL_MUST_USE_RESULT | ||||||
|  |   ABSL_ATTRIBUTE_HOT | ||||||
|  |   int Capture() { | ||||||
|  |     DCheckPyGilState(); | ||||||
|  |     const int id = next_id_++; | ||||||
|  |     const int index = id & (kStackTraceCircularBufferSize - 1); | ||||||
|  |     stack_traces_[index] = StackTrace::Capture(); | ||||||
|  |     return id; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   // Retrieve captured Python stack trace by id. Returns `nullptr` if the
 | ||||||
|  |   // requested stack trace is evicted from the circular buffer.
 | ||||||
|  |   // Python GIL must be acquired beforehand.
 | ||||||
|  |   ABSL_MUST_USE_RESULT | ||||||
|  |   StackTrace* Get(int id); | ||||||
|  | 
 | ||||||
|  |  private: | ||||||
|  |   int next_id_ = 0; | ||||||
|  |   std::array<StackTrace, kStackTraceCircularBufferSize> stack_traces_; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | // Singleton StackTraceManager.
 | ||||||
|  | extern StackTraceManager* const stack_trace_manager; | ||||||
|  | 
 | ||||||
|  | // Returns a Python stack trace object that can be converted to string.
 | ||||||
|  | // Note that the actual stack trace is kept in a circular buffer, so string
 | ||||||
|  | // conversion could fail if the trace is evicted before it is converted.
 | ||||||
|  | // Python GIL must be acquired beforehand.
 | ||||||
|  | inline AbstractStackTrace GetStackTrace() { | ||||||
|  |   DCheckPyGilState(); | ||||||
|  |   return AbstractStackTrace(stack_trace_manager->Capture(), [](int id) { | ||||||
|  |     PyGILState_STATE gstate = PyGILState_Ensure(); | ||||||
|  |     std::vector<StackFrame> result = | ||||||
|  |         stack_trace_manager->Get(id)->ToStackFrames(); | ||||||
|  |     PyGILState_Release(gstate); | ||||||
|  |     return result; | ||||||
|  |   }); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| }  // namespace tensorflow
 | }  // namespace tensorflow
 | ||||||
| 
 | 
 | ||||||
| #endif  // TENSORFLOW_PYTHON_UTIL_STACK_TRACE_H_
 | #endif  // TENSORFLOW_PYTHON_UTIL_STACK_TRACE_H_
 | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user