Propagate eager mode stack traces into OpKernelCtx

PiperOrigin-RevId: 352853640
Change-Id: I66c70afb9b5004c609a9d98e097be8a48e9be533
Authored by George Karpenkov on 2021-01-20 12:45:03 -08:00; committed by TensorFlower Gardener
parent 3eac805944
commit 053d50118d
10 changed files with 55 additions and 27 deletions
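
In short: the change plumbs the stack trace captured for an eager op (`op->GetStackTrace()`) through `EagerKernelExecute` and `KernelAndDevice::Run` into `OpKernelContext::Params`, where kernels can query it via the new `stack_trace()` accessor. A minimal sketch of the consumer side, assuming a hypothetical kernel `TraceAwareOp` (only the `OpKernelContext::stack_trace()` accessor comes from this commit; the kernel itself is illustrative):

#include "tensorflow/core/framework/op_kernel.h"

namespace tensorflow {

// Hypothetical kernel demonstrating the accessor added by this commit.
class TraceAwareOp : public OpKernel {
 public:
  explicit TraceAwareOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}

  void Compute(OpKernelContext* ctx) override {
    // The trace is populated only when the op was created in eager mode;
    // graph-mode executions leave the optional empty.
    if (ctx->stack_trace().has_value()) {
      VLOG(1) << "Kernel " << name() << " has an eager stack trace attached.";
    }
  }
};

}  // namespace tensorflow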

View File

@@ -1537,6 +1537,7 @@ tf_cuda_library(
             "@com_google_protobuf//:protobuf",
         ],
         otherwise = [
+            "//tensorflow/core/util:abstract_stack_trace",
             "@com_google_protobuf//:protobuf_headers",
         ],
     ),
@@ -1625,6 +1626,7 @@ tf_cuda_library(
         "//tensorflow/core/platform/default/build_config:platformlib",
         "//tensorflow/core/profiler/lib:annotated_traceme",
         "//tensorflow/core/profiler/lib:traceme",
+        "//tensorflow/core/util:abstract_stack_trace",
         "//tensorflow/core/util:einsum_op_util",
         "//tensorflow/core/util:padding",
         "//tensorflow/core/util:port",

View File

@@ -664,7 +664,8 @@ Status AddOrExecuteNode(core::RefCountPtr<KernelAndDevice> kernel,
     TF_RETURN_IF_ERROR(op->TensorHandleInputs(&inputs));
     ExecuteNode node(&ctx, *inputs, remote_func_params, kernel, graph_collector,
                      op->GetCancellationManager(),
-                     {retvals, static_cast<size_t>(num_outputs)});
+                     {retvals, static_cast<size_t>(num_outputs)},
+                     op->GetStackTrace());
     Status s = executor.SyncExecute(&node);
     // We release the inputs AFTER executing the operation in sync mode since
     // ExecuteNode does not increment the reference count and thus does not have
@@ -1106,7 +1107,8 @@ Status EagerKernelExecute(
     const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
     const core::RefCountPtr<KernelAndDevice>& kernel,
     GraphCollector* graph_collector, CancellationManager* cancellation_manager,
-    absl::Span<TensorHandle*> retvals) {
+    absl::Span<TensorHandle*> retvals,
+    const absl::optional<ManagedStackTrace>& stack_trace) {
   profiler::TraceMe activity("EagerKernelExecute",
                              profiler::TraceMeLevel::kInfo);
   std::vector<EagerKernelRet> outputs(1);
@@ -1121,7 +1123,8 @@ Status EagerKernelExecute(
   // acquires a lock) and we can't recover from errors anyway.
   ScopedStepContainer* container = ctx->StepContainer();
   TF_RETURN_IF_ERROR(kernel->Run(container, inputs, &outputs,
-                                 cancellation_manager, remote_func_params));
+                                 cancellation_manager, remote_func_params,
+                                 stack_trace));
   if (graph_collector != nullptr) {
     CollectGraphs(ctx);
   }

View File

@@ -52,7 +52,8 @@ Status EagerKernelExecute(
    const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
    const core::RefCountPtr<KernelAndDevice>& kernel,
    GraphCollector* graph_collector, CancellationManager* cancellation_manager,
-    absl::Span<TensorHandle*> retvals);
+    absl::Span<TensorHandle*> retvals,
+    const absl::optional<ManagedStackTrace>& stack_trace = {});

 // Low-level utility to copy a tensor handle from one device to another. If
 // successful, result TensorHandle will be populated. If the caller requests for

View File

@@ -97,7 +97,8 @@ class ExecuteNode : public EagerNode {
               const core::RefCountPtr<KernelAndDevice>& kernel,
               GraphCollector* graph_collector,
               CancellationManager* cancellation_manager,
-              absl::Span<TensorHandle*> retvals)
+              absl::Span<TensorHandle*> retvals,
+              absl::optional<ManagedStackTrace> stack_trace)
       : EagerNode(),
         ctx_(ctx),
         inputs_(inputs),
@@ -105,7 +106,8 @@ class ExecuteNode : public EagerNode {
         kernel_(kernel),
         graph_collector_(graph_collector),
         cancellation_manager_(cancellation_manager),
-        retvals_(retvals) {}
+        retvals_(retvals),
+        stack_trace_(stack_trace) {}

   Status Run() override {
     int i = 0;
@@ -120,8 +122,8 @@ class ExecuteNode : public EagerNode {
       ++i;
     }
     return EagerKernelExecute(ctx_, inputs_, remote_func_params_, kernel_,
-                              graph_collector_, cancellation_manager_,
-                              retvals_);
+                              graph_collector_, cancellation_manager_, retvals_,
+                              stack_trace_);
   }

   void Abort(Status status) override {}
@@ -140,6 +142,7 @@ class ExecuteNode : public EagerNode {
   GraphCollector* graph_collector_;
   CancellationManager* const cancellation_manager_;
   absl::Span<TensorHandle*> retvals_;
+  absl::optional<ManagedStackTrace> stack_trace_;
 };

 class AsyncExecuteNode : public EagerNode {
@@ -198,7 +201,7 @@ class AsyncExecuteNode : public EagerNode {
     }
     Status status = EagerKernelExecute(
         ctx_, inputs_, remote_func_params_, kernel_, graph_collector_,
-        cancellation_manager_, absl::MakeSpan(retvals_));
+        cancellation_manager_, absl::MakeSpan(retvals_), stack_trace_);
     if (!status.ok()) {
       if (stack_trace_.has_value()) {
         status = Status(status.code(), status.error_message(),

View File

@@ -243,7 +243,8 @@ Status KernelAndDeviceOp::Run(
     ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
     std::vector<EagerKernelRet>* outputs,
     CancellationManager* cancellation_manager,
-    const absl::optional<EagerRemoteFunctionParams>& remote_func_params) {
+    const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
+    const absl::optional<ManagedStackTrace>& stack_trace) {
   OpKernelContext::Params params;
   params.device = device_;
   params.frame_iter = FrameAndIter(0, 0);
@@ -255,6 +256,7 @@ Status KernelAndDeviceOp::Run(
   params.function_library = flr_;
   params.slice_reader_cache = &slice_reader_cache_;
   params.rendezvous = rendezvous_;
+  params.stack_trace = stack_trace;
   OpExecutionState* op_execution_state = nullptr;

   CancellationManager default_cancellation_manager;
@@ -320,7 +322,8 @@ Status KernelAndDeviceFunc::Run(
     ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
     std::vector<EagerKernelRet>* outputs,
     CancellationManager* cancellation_manager,
-    const absl::optional<EagerRemoteFunctionParams>& remote_func_params) {
+    const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
+    const absl::optional<ManagedStackTrace>& stack_trace) {
   Notification n;
   Status status;
   RunAsync(step_container, inputs, outputs, cancellation_manager,

View File

@ -40,6 +40,7 @@ limitations under the License.
#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h" #include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/platform/fingerprint.h" #include "tensorflow/core/platform/fingerprint.h"
#include "tensorflow/core/util/managed_stack_trace.h"
#include "tensorflow/core/util/tensor_slice_reader_cache.h" #include "tensorflow/core/util/tensor_slice_reader_cache.h"
#if !defined(IS_MOBILE_PLATFORM) #if !defined(IS_MOBILE_PLATFORM)
#include "tensorflow/core/protobuf/remote_tensor_handle.pb.h" #include "tensorflow/core/protobuf/remote_tensor_handle.pb.h"
@ -131,7 +132,8 @@ class KernelAndDevice : public core::RefCounted {
ScopedStepContainer* step_container, const EagerKernelArgs& inputs, ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
std::vector<EagerKernelRet>* outputs, std::vector<EagerKernelRet>* outputs,
CancellationManager* cancellation_manager, CancellationManager* cancellation_manager,
const absl::optional<EagerRemoteFunctionParams>& remote_func_params) = 0; const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
const absl::optional<ManagedStackTrace>& stack_trace) = 0;
// Execute kernel asynchronously when applicable. Different from `Run` which // Execute kernel asynchronously when applicable. Different from `Run` which
// blocks the caller thread and waits for the execution of the op/function, // blocks the caller thread and waits for the execution of the op/function,
@ -203,11 +205,12 @@ class KernelAndDeviceOp final : public KernelAndDevice {
Status Init(const bool log_device_placement, const NodeDef& ndef, Status Init(const bool log_device_placement, const NodeDef& ndef,
GraphCollector* graph_collector) override; GraphCollector* graph_collector) override;
Status Run(ScopedStepContainer* step_container, const EagerKernelArgs& inputs, Status Run(
ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
std::vector<EagerKernelRet>* outputs, std::vector<EagerKernelRet>* outputs,
CancellationManager* cancellation_manager, CancellationManager* cancellation_manager,
const absl::optional<EagerRemoteFunctionParams>& const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
remote_func_params) override; const absl::optional<ManagedStackTrace>& stack_trace) override;
void RunAsync( void RunAsync(
ScopedStepContainer* step_container, const EagerKernelArgs& inputs, ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
@ -217,7 +220,7 @@ class KernelAndDeviceOp final : public KernelAndDevice {
StatusCallback done) override { StatusCallback done) override {
// Trivial async implementation on top of the sync version // Trivial async implementation on top of the sync version
done(Run(step_container, inputs, outputs, cancellation_manager, done(Run(step_container, inputs, outputs, cancellation_manager,
remote_func_params)); remote_func_params, {}));
} }
const OpKernel* kernel() const override { return kernel_.get(); } const OpKernel* kernel() const override { return kernel_.get(); }
@ -291,11 +294,12 @@ class KernelAndDeviceFunc : public KernelAndDevice {
Status Init(const bool log_device_placement, const NodeDef& ndef, Status Init(const bool log_device_placement, const NodeDef& ndef,
GraphCollector* graph_collector) override; GraphCollector* graph_collector) override;
Status Run(ScopedStepContainer* step_container, const EagerKernelArgs& inputs, Status Run(
ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
std::vector<EagerKernelRet>* outputs, std::vector<EagerKernelRet>* outputs,
CancellationManager* cancellation_manager, CancellationManager* cancellation_manager,
const absl::optional<EagerRemoteFunctionParams>& const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
remote_func_params) override; const absl::optional<ManagedStackTrace>& stack_trace) override;
void RunAsync( void RunAsync(
ScopedStepContainer* step_container, const EagerKernelArgs& inputs, ScopedStepContainer* step_container, const EagerKernelArgs& inputs,

View File

@@ -139,7 +139,8 @@ void BM_KernelAndDeviceRun(::testing::benchmark::State& state) {
   TF_CHECK_OK(k.Init({}, ndef, nullptr));
   const EagerKernelArgs args(std::move(inputs));
   for (auto s : state) {
-    TF_CHECK_OK(k.Run(nullptr, args, &outputs, nullptr, absl::nullopt));
+    TF_CHECK_OK(
+        k.Run(nullptr, args, &outputs, nullptr, absl::nullopt, absl::nullopt));
   }
 }
 BENCHMARK(BM_KernelAndDeviceRun);

View File

@@ -980,7 +980,8 @@ TEST_F(FunctionWithRemoteInputsTest, KernelAndDeviceFuncTest) {
   TF_ASSERT_OK(kernel->Run(/*step_container=*/nullptr, inputs, &outputs,
                            /*cancellation_manager=*/nullptr,
-                           /*remote_func_params=*/absl::nullopt));
+                           /*remote_func_params=*/absl::nullopt,
+                           /*stack_trace=*/absl::nullopt));
   CheckOutputsAndClose(outputs, op_id);
 }

View File

@@ -110,7 +110,8 @@ Status RemoteCopyNode::RunLocalSend(EagerOperation* op) {
   return kernel->Run(/*step_container=*/nullptr, args, /*outputs=*/nullptr,
                      /*cancellation_manager=*/nullptr,
-                     /*remote_func_params=*/absl::nullopt);
+                     /*remote_func_params=*/absl::nullopt,
+                     /*stack_trace=*/absl::nullopt);
 }

 void RemoteCopyNode::StartSend() {
@@ -195,7 +196,8 @@ Status RemoteCopyNode::RunLocalRecv(EagerOperation* op,
   std::vector<EagerKernelRet> rets;
   TF_RETURN_IF_ERROR(kernel->Run(/*step_container*/ nullptr, args, &rets,
                                  captured_state_->recv_cancellation(),
-                                 /*remote_func_params=*/absl::nullopt));
+                                 /*remote_func_params=*/absl::nullopt,
+                                 /*stack_trace=*/absl::nullopt));
   outputs->clear();
   for (const auto& ret : rets) {
     if (ret.index() == 0) {

View File

@@ -54,6 +54,7 @@ limitations under the License.
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/core/util/managed_stack_trace.h"

 namespace Eigen {
 struct ThreadPoolDevice;
@@ -701,6 +702,8 @@ class OpKernelContext {
     std::function<void()> inc_num_deferred_ops_function;
     std::function<void()> dec_num_deferred_ops_function;

+    absl::optional<ManagedStackTrace> stack_trace = {};
+
     // For implementing `OpKernelContext::output_required()`. If null, all
     // outputs are required.
     bool* outputs_required_array = nullptr;
@@ -717,6 +720,11 @@ class OpKernelContext {
   const OpKernel& op_kernel() const { return *params_->op_kernel; }

+  // Stack trace of where the op was defined (if defined in eager mode).
+  const absl::optional<ManagedStackTrace>& stack_trace() const {
+    return params_->stack_trace;
+  }
+
   // Input/output signature.
   int num_inputs() const { return params_->inputs->size(); }
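
For the producer side, KernelAndDeviceOp::Run above shows the real wiring. A condensed sketch of the same idea, assuming a fully initialized kernel and device; the helper RunWithTrace is hypothetical, input setup and error handling are omitted, and only Params::stack_trace and the stack_trace() accessor come from this commit:

#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/util/managed_stack_trace.h"

namespace tensorflow {

// Hypothetical helper, condensed from KernelAndDeviceOp::Run: forward the
// caller's optional trace into the context params so the kernel can see it.
Status RunWithTrace(OpKernel* kernel, Device* device,
                    const absl::optional<ManagedStackTrace>& stack_trace) {
  OpKernelContext::Params params;
  params.device = device;
  params.op_kernel = kernel;
  params.stack_trace = stack_trace;  // field added by this commit
  OpKernelContext context(&params, kernel->num_outputs());
  device->Compute(kernel, &context);  // kernel may read context.stack_trace()
  return context.status();
}

}  // namespace tensorflow

Because execute.h declares the new EagerKernelExecute parameter with a default of {} and KernelAndDeviceOp::RunAsync forwards {}, existing call sites keep compiling unchanged; only the synchronous eager path threads a real trace through.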