Propagate eager mode stack traces into OpKernelCtx

PiperOrigin-RevId: 352853640
Change-Id: I66c70afb9b5004c609a9d98e097be8a48e9be533
Authored by George Karpenkov on 2021-01-20 12:45:03 -08:00; committed by TensorFlower Gardener
parent 3eac805944
commit 053d50118d
10 changed files with 55 additions and 27 deletions
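
In short: the change plumbs the stack trace captured for an eager op (`op->GetStackTrace()`) through `EagerKernelExecute` and `KernelAndDevice::Run` into `OpKernelContext::Params`, where kernels can query it via the new `stack_trace()` accessor. A minimal sketch of the consumer side, assuming a hypothetical kernel `TraceAwareOp` (only the `OpKernelContext::stack_trace()` accessor comes from this commit; the kernel itself is illustrative):

#include "tensorflow/core/framework/op_kernel.h"

namespace tensorflow {

// Hypothetical kernel demonstrating the accessor added by this commit.
class TraceAwareOp : public OpKernel {
 public:
  explicit TraceAwareOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}

  void Compute(OpKernelContext* ctx) override {
    // The trace is populated only when the op was created in eager mode;
    // graph-mode executions leave the optional empty.
    if (ctx->stack_trace().has_value()) {
      VLOG(1) << "Kernel " << name() << " has an eager stack trace attached.";
    }
  }
};

}  // namespace tensorflow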

View File

@@ -1537,6 +1537,7 @@ tf_cuda_library(
             "@com_google_protobuf//:protobuf",
         ],
         otherwise = [
+            "//tensorflow/core/util:abstract_stack_trace",
             "@com_google_protobuf//:protobuf_headers",
         ],
     ),
@@ -1625,6 +1626,7 @@ tf_cuda_library(
         "//tensorflow/core/platform/default/build_config:platformlib",
         "//tensorflow/core/profiler/lib:annotated_traceme",
         "//tensorflow/core/profiler/lib:traceme",
+        "//tensorflow/core/util:abstract_stack_trace",
         "//tensorflow/core/util:einsum_op_util",
         "//tensorflow/core/util:padding",
         "//tensorflow/core/util:port",

View File

@@ -664,7 +664,8 @@ Status AddOrExecuteNode(core::RefCountPtr<KernelAndDevice> kernel,
     TF_RETURN_IF_ERROR(op->TensorHandleInputs(&inputs));
     ExecuteNode node(&ctx, *inputs, remote_func_params, kernel, graph_collector,
                      op->GetCancellationManager(),
-                     {retvals, static_cast<size_t>(num_outputs)});
+                     {retvals, static_cast<size_t>(num_outputs)},
+                     op->GetStackTrace());
     Status s = executor.SyncExecute(&node);
     // We release the inputs AFTER executing the operation in sync mode since
     // ExecuteNode does not increment the reference count and thus does not have
@@ -1106,7 +1107,8 @@ Status EagerKernelExecute(
     const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
     const core::RefCountPtr<KernelAndDevice>& kernel,
     GraphCollector* graph_collector, CancellationManager* cancellation_manager,
-    absl::Span<TensorHandle*> retvals) {
+    absl::Span<TensorHandle*> retvals,
+    const absl::optional<ManagedStackTrace>& stack_trace) {
   profiler::TraceMe activity("EagerKernelExecute",
                              profiler::TraceMeLevel::kInfo);
   std::vector<EagerKernelRet> outputs(1);
@@ -1121,7 +1123,8 @@ Status EagerKernelExecute(
   // acquires a lock) and we can't recover from errors anyway.
   ScopedStepContainer* container = ctx->StepContainer();
   TF_RETURN_IF_ERROR(kernel->Run(container, inputs, &outputs,
-                                 cancellation_manager, remote_func_params));
+                                 cancellation_manager, remote_func_params,
+                                 stack_trace));
   if (graph_collector != nullptr) {
     CollectGraphs(ctx);
   }

View File

@@ -52,7 +52,8 @@ Status EagerKernelExecute(
    const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
    const core::RefCountPtr<KernelAndDevice>& kernel,
    GraphCollector* graph_collector, CancellationManager* cancellation_manager,
-    absl::Span<TensorHandle*> retvals);
+    absl::Span<TensorHandle*> retvals,
+    const absl::optional<ManagedStackTrace>& stack_trace = {});

 // Low-level utility to copy a tensor handle from one device to another. If
 // successful, result TensorHandle will be populated. If the caller requests for

View File

@@ -97,7 +97,8 @@ class ExecuteNode : public EagerNode {
               const core::RefCountPtr<KernelAndDevice>& kernel,
               GraphCollector* graph_collector,
               CancellationManager* cancellation_manager,
-              absl::Span<TensorHandle*> retvals)
+              absl::Span<TensorHandle*> retvals,
+              absl::optional<ManagedStackTrace> stack_trace)
       : EagerNode(),
         ctx_(ctx),
         inputs_(inputs),
@@ -105,7 +106,8 @@ class ExecuteNode : public EagerNode {
         kernel_(kernel),
         graph_collector_(graph_collector),
         cancellation_manager_(cancellation_manager),
-        retvals_(retvals) {}
+        retvals_(retvals),
+        stack_trace_(stack_trace) {}

   Status Run() override {
     int i = 0;
@@ -120,8 +122,8 @@ class ExecuteNode : public EagerNode {
       ++i;
     }
     return EagerKernelExecute(ctx_, inputs_, remote_func_params_, kernel_,
-                              graph_collector_, cancellation_manager_,
-                              retvals_);
+                              graph_collector_, cancellation_manager_, retvals_,
+                              stack_trace_);
   }

   void Abort(Status status) override {}
@@ -140,6 +142,7 @@ class ExecuteNode : public EagerNode {
   GraphCollector* graph_collector_;
   CancellationManager* const cancellation_manager_;
   absl::Span<TensorHandle*> retvals_;
+  absl::optional<ManagedStackTrace> stack_trace_;
 };

 class AsyncExecuteNode : public EagerNode {
@@ -198,7 +201,7 @@ class AsyncExecuteNode : public EagerNode {
     }
     Status status = EagerKernelExecute(
         ctx_, inputs_, remote_func_params_, kernel_, graph_collector_,
-        cancellation_manager_, absl::MakeSpan(retvals_));
+        cancellation_manager_, absl::MakeSpan(retvals_), stack_trace_);
     if (!status.ok()) {
       if (stack_trace_.has_value()) {
         status = Status(status.code(), status.error_message(),

View File

@@ -243,7 +243,8 @@ Status KernelAndDeviceOp::Run(
     ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
     std::vector<EagerKernelRet>* outputs,
     CancellationManager* cancellation_manager,
-    const absl::optional<EagerRemoteFunctionParams>& remote_func_params) {
+    const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
+    const absl::optional<ManagedStackTrace>& stack_trace) {
   OpKernelContext::Params params;
   params.device = device_;
   params.frame_iter = FrameAndIter(0, 0);
@@ -255,6 +256,7 @@ Status KernelAndDeviceOp::Run(
   params.function_library = flr_;
   params.slice_reader_cache = &slice_reader_cache_;
   params.rendezvous = rendezvous_;
+  params.stack_trace = stack_trace;
   OpExecutionState* op_execution_state = nullptr;

   CancellationManager default_cancellation_manager;
@@ -320,7 +322,8 @@ Status KernelAndDeviceFunc::Run(
     ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
     std::vector<EagerKernelRet>* outputs,
     CancellationManager* cancellation_manager,
-    const absl::optional<EagerRemoteFunctionParams>& remote_func_params) {
+    const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
+    const absl::optional<ManagedStackTrace>& stack_trace) {
   Notification n;
   Status status;
   RunAsync(step_container, inputs, outputs, cancellation_manager,

View File

@ -40,6 +40,7 @@ limitations under the License.
#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h" #include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/platform/fingerprint.h" #include "tensorflow/core/platform/fingerprint.h"
#include "tensorflow/core/util/managed_stack_trace.h"
#include "tensorflow/core/util/tensor_slice_reader_cache.h" #include "tensorflow/core/util/tensor_slice_reader_cache.h"
#if !defined(IS_MOBILE_PLATFORM) #if !defined(IS_MOBILE_PLATFORM)
#include "tensorflow/core/protobuf/remote_tensor_handle.pb.h" #include "tensorflow/core/protobuf/remote_tensor_handle.pb.h"
@ -131,7 +132,8 @@ class KernelAndDevice : public core::RefCounted {
ScopedStepContainer* step_container, const EagerKernelArgs& inputs, ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
std::vector<EagerKernelRet>* outputs, std::vector<EagerKernelRet>* outputs,
CancellationManager* cancellation_manager, CancellationManager* cancellation_manager,
const absl::optional<EagerRemoteFunctionParams>& remote_func_params) = 0; const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
const absl::optional<ManagedStackTrace>& stack_trace) = 0;
// Execute kernel asynchronously when applicable. Different from `Run` which // Execute kernel asynchronously when applicable. Different from `Run` which
// blocks the caller thread and waits for the execution of the op/function, // blocks the caller thread and waits for the execution of the op/function,
@ -203,11 +205,12 @@ class KernelAndDeviceOp final : public KernelAndDevice {
Status Init(const bool log_device_placement, const NodeDef& ndef, Status Init(const bool log_device_placement, const NodeDef& ndef,
GraphCollector* graph_collector) override; GraphCollector* graph_collector) override;
Status Run(ScopedStepContainer* step_container, const EagerKernelArgs& inputs, Status Run(
ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
std::vector<EagerKernelRet>* outputs, std::vector<EagerKernelRet>* outputs,
CancellationManager* cancellation_manager, CancellationManager* cancellation_manager,
const absl::optional<EagerRemoteFunctionParams>& const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
remote_func_params) override; const absl::optional<ManagedStackTrace>& stack_trace) override;
void RunAsync( void RunAsync(
ScopedStepContainer* step_container, const EagerKernelArgs& inputs, ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
@ -217,7 +220,7 @@ class KernelAndDeviceOp final : public KernelAndDevice {
StatusCallback done) override { StatusCallback done) override {
// Trivial async implementation on top of the sync version // Trivial async implementation on top of the sync version
done(Run(step_container, inputs, outputs, cancellation_manager, done(Run(step_container, inputs, outputs, cancellation_manager,
remote_func_params)); remote_func_params, {}));
} }
const OpKernel* kernel() const override { return kernel_.get(); } const OpKernel* kernel() const override { return kernel_.get(); }
@ -291,11 +294,12 @@ class KernelAndDeviceFunc : public KernelAndDevice {
Status Init(const bool log_device_placement, const NodeDef& ndef, Status Init(const bool log_device_placement, const NodeDef& ndef,
GraphCollector* graph_collector) override; GraphCollector* graph_collector) override;
Status Run(ScopedStepContainer* step_container, const EagerKernelArgs& inputs, Status Run(
ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
std::vector<EagerKernelRet>* outputs, std::vector<EagerKernelRet>* outputs,
CancellationManager* cancellation_manager, CancellationManager* cancellation_manager,
const absl::optional<EagerRemoteFunctionParams>& const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
remote_func_params) override; const absl::optional<ManagedStackTrace>& stack_trace) override;
void RunAsync( void RunAsync(
ScopedStepContainer* step_container, const EagerKernelArgs& inputs, ScopedStepContainer* step_container, const EagerKernelArgs& inputs,

View File

@@ -139,7 +139,8 @@ void BM_KernelAndDeviceRun(::testing::benchmark::State& state) {
   TF_CHECK_OK(k.Init({}, ndef, nullptr));
   const EagerKernelArgs args(std::move(inputs));
   for (auto s : state) {
-    TF_CHECK_OK(k.Run(nullptr, args, &outputs, nullptr, absl::nullopt));
+    TF_CHECK_OK(
+        k.Run(nullptr, args, &outputs, nullptr, absl::nullopt, absl::nullopt));
   }
 }
 BENCHMARK(BM_KernelAndDeviceRun);

View File

@@ -980,7 +980,8 @@ TEST_F(FunctionWithRemoteInputsTest, KernelAndDeviceFuncTest) {
   TF_ASSERT_OK(kernel->Run(/*step_container=*/nullptr, inputs, &outputs,
                            /*cancellation_manager=*/nullptr,
-                           /*remote_func_params=*/absl::nullopt));
+                           /*remote_func_params=*/absl::nullopt,
+                           /*stack_trace=*/absl::nullopt));
   CheckOutputsAndClose(outputs, op_id);
 }

View File

@@ -110,7 +110,8 @@ Status RemoteCopyNode::RunLocalSend(EagerOperation* op) {
   return kernel->Run(/*step_container=*/nullptr, args, /*outputs=*/nullptr,
                      /*cancellation_manager=*/nullptr,
-                     /*remote_func_params=*/absl::nullopt);
+                     /*remote_func_params=*/absl::nullopt,
+                     /*stack_trace=*/absl::nullopt);
 }

 void RemoteCopyNode::StartSend() {
@@ -195,7 +196,8 @@ Status RemoteCopyNode::RunLocalRecv(EagerOperation* op,
   std::vector<EagerKernelRet> rets;
   TF_RETURN_IF_ERROR(kernel->Run(/*step_container*/ nullptr, args, &rets,
                                  captured_state_->recv_cancellation(),
-                                 /*remote_func_params=*/absl::nullopt));
+                                 /*remote_func_params=*/absl::nullopt,
+                                 /*stack_trace=*/absl::nullopt));
   outputs->clear();
   for (const auto& ret : rets) {
     if (ret.index() == 0) {

View File

@@ -54,6 +54,7 @@ limitations under the License.
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/core/util/managed_stack_trace.h"

 namespace Eigen {
 struct ThreadPoolDevice;
@@ -701,6 +702,8 @@ class OpKernelContext {
     std::function<void()> inc_num_deferred_ops_function;
     std::function<void()> dec_num_deferred_ops_function;

+    absl::optional<ManagedStackTrace> stack_trace = {};
+
     // For implementing `OpKernelContext::output_required()`. If null, all
     // outputs are required.
     bool* outputs_required_array = nullptr;
@@ -717,6 +720,11 @@ class OpKernelContext {
   const OpKernel& op_kernel() const { return *params_->op_kernel; }

+  // Stack trace of where the op was defined (if defined in eager mode).
+  const absl::optional<ManagedStackTrace>& stack_trace() const {
+    return params_->stack_trace;
+  }
+
   // Input/output signature.
   int num_inputs() const { return params_->inputs->size(); }
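
For the producer side, KernelAndDeviceOp::Run above shows the real wiring. A condensed sketch of the same idea, assuming a fully initialized kernel and device; the helper RunWithTrace is hypothetical, input setup and error handling are omitted, and only Params::stack_trace and the stack_trace() accessor come from this commit:

#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/util/managed_stack_trace.h"

namespace tensorflow {

// Hypothetical helper, condensed from KernelAndDeviceOp::Run: forward the
// caller's optional trace into the context params so the kernel can see it.
Status RunWithTrace(OpKernel* kernel, Device* device,
                    const absl::optional<ManagedStackTrace>& stack_trace) {
  OpKernelContext::Params params;
  params.device = device;
  params.op_kernel = kernel;
  params.stack_trace = stack_trace;  // field added by this commit
  OpKernelContext context(&params, kernel->num_outputs());
  device->Compute(kernel, &context);  // kernel may read context.stack_trace()
  return context.status();
}

}  // namespace tensorflow

Because execute.h declares the new EagerKernelExecute parameter with a default of {} and KernelAndDeviceOp::RunAsync forwards {}, existing call sites keep compiling unchanged; only the synchronous eager path threads a real trace through.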