Propagate eager mode stack traces into OpKernelCtx

PiperOrigin-RevId: 352853640
Change-Id: I66c70afb9b5004c609a9d98e097be8a48e9be533
Author: George Karpenkov
Date: 2021-01-20 12:45:03 -08:00
Committed-by: TensorFlower Gardener
Parent: 3eac805944
Commit: 053d50118d

10 changed files with 55 additions and 27 deletions
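This change threads the Python-level stack trace captured at op definition
time through the eager execution path: EagerOperation::GetStackTrace() is
passed into ExecuteNode, forwarded through EagerKernelExecute() and
KernelAndDevice::Run(), and finally stored on OpKernelContext::Params, where
kernels can read it via the new OpKernelContext::stack_trace() accessor.

A minimal standalone sketch of the plumbing pattern (not TensorFlow code:
std::optional stands in for absl::optional, and StackTrace, RunKernel, and
Execute are invented names used only for illustration):

    #include <iostream>
    #include <optional>
    #include <string>

    struct StackTrace {  // stand-in for tensorflow::ManagedStackTrace
      std::string origin;
    };

    // Innermost layer: plays the role of OpKernelContext::Params::stack_trace.
    void RunKernel(const std::optional<StackTrace>& stack_trace) {
      if (stack_trace.has_value()) {
        std::cout << "op defined at: " << stack_trace->origin << "\n";
      } else {
        std::cout << "no stack trace attached\n";
      }
    }

    // Middle layer: plays the role of EagerKernelExecute(). The defaulted
    // argument mirrors `const absl::optional<ManagedStackTrace>&
    // stack_trace = {}` in execute.h, so pre-existing callers keep compiling.
    void Execute(const std::optional<StackTrace>& stack_trace = {}) {
      RunKernel(stack_trace);
    }

    int main() {
      Execute();                              // old call site: still compiles
      Execute(StackTrace{"my_model.py:42"});  // eager path: trace propagated
      return 0;
    }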

@@ -1537,6 +1537,7 @@ tf_cuda_library(
             "@com_google_protobuf//:protobuf",
         ],
         otherwise = [
+            "//tensorflow/core/util:abstract_stack_trace",
            "@com_google_protobuf//:protobuf_headers",
         ],
     ),
@@ -1625,6 +1626,7 @@ tf_cuda_library(
         "//tensorflow/core/platform/default/build_config:platformlib",
         "//tensorflow/core/profiler/lib:annotated_traceme",
         "//tensorflow/core/profiler/lib:traceme",
+        "//tensorflow/core/util:abstract_stack_trace",
         "//tensorflow/core/util:einsum_op_util",
         "//tensorflow/core/util:padding",
         "//tensorflow/core/util:port",

@@ -664,7 +664,8 @@ Status AddOrExecuteNode(core::RefCountPtr<KernelAndDevice> kernel,
     TF_RETURN_IF_ERROR(op->TensorHandleInputs(&inputs));
     ExecuteNode node(&ctx, *inputs, remote_func_params, kernel, graph_collector,
                      op->GetCancellationManager(),
-                     {retvals, static_cast<size_t>(num_outputs)});
+                     {retvals, static_cast<size_t>(num_outputs)},
+                     op->GetStackTrace());
     Status s = executor.SyncExecute(&node);
     // We release the inputs AFTER executing the operation in sync mode since
     // ExecuteNode does not increment the reference count and thus does not have
@@ -1106,7 +1107,8 @@ Status EagerKernelExecute(
     const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
     const core::RefCountPtr<KernelAndDevice>& kernel,
     GraphCollector* graph_collector, CancellationManager* cancellation_manager,
-    absl::Span<TensorHandle*> retvals) {
+    absl::Span<TensorHandle*> retvals,
+    const absl::optional<ManagedStackTrace>& stack_trace) {
   profiler::TraceMe activity("EagerKernelExecute",
                              profiler::TraceMeLevel::kInfo);
   std::vector<EagerKernelRet> outputs(1);
@@ -1121,7 +1123,8 @@ Status EagerKernelExecute(
   // acquires a lock) and we can't recover from errors anyway.
   ScopedStepContainer* container = ctx->StepContainer();
   TF_RETURN_IF_ERROR(kernel->Run(container, inputs, &outputs,
-                                 cancellation_manager, remote_func_params));
+                                 cancellation_manager, remote_func_params,
+                                 stack_trace));
   if (graph_collector != nullptr) {
     CollectGraphs(ctx);
   }

@@ -52,7 +52,8 @@ Status EagerKernelExecute(
     const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
     const core::RefCountPtr<KernelAndDevice>& kernel,
     GraphCollector* graph_collector, CancellationManager* cancellation_manager,
-    absl::Span<TensorHandle*> retvals);
+    absl::Span<TensorHandle*> retvals,
+    const absl::optional<ManagedStackTrace>& stack_trace = {});
 
 // Low-level utility to copy a tensor handle from one device to another. If
 // successful, result TensorHandle will be populated. If the caller requests for
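Note the defaulted argument in the header above: existing callers of
EagerKernelExecute() that have no trace to attach need no changes. By
contrast, the pure-virtual KernelAndDevice::Run() below adds stack_trace
without a default, which is why every override, benchmark, and test call
site later in this diff passes an explicit value, typically absl::nullopt.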

@@ -97,7 +97,8 @@ class ExecuteNode : public EagerNode {
               const core::RefCountPtr<KernelAndDevice>& kernel,
               GraphCollector* graph_collector,
               CancellationManager* cancellation_manager,
-              absl::Span<TensorHandle*> retvals)
+              absl::Span<TensorHandle*> retvals,
+              absl::optional<ManagedStackTrace> stack_trace)
       : EagerNode(),
         ctx_(ctx),
         inputs_(inputs),
@@ -105,7 +106,8 @@ class ExecuteNode : public EagerNode {
         kernel_(kernel),
         graph_collector_(graph_collector),
         cancellation_manager_(cancellation_manager),
-        retvals_(retvals) {}
+        retvals_(retvals),
+        stack_trace_(stack_trace) {}
 
   Status Run() override {
     int i = 0;
@@ -120,8 +122,8 @@ class ExecuteNode : public EagerNode {
       ++i;
     }
     return EagerKernelExecute(ctx_, inputs_, remote_func_params_, kernel_,
-                              graph_collector_, cancellation_manager_,
-                              retvals_);
+                              graph_collector_, cancellation_manager_, retvals_,
+                              stack_trace_);
   }
 
   void Abort(Status status) override {}
@@ -140,6 +142,7 @@ class ExecuteNode : public EagerNode {
   GraphCollector* graph_collector_;
   CancellationManager* const cancellation_manager_;
   absl::Span<TensorHandle*> retvals_;
+  absl::optional<ManagedStackTrace> stack_trace_;
 };
 
 class AsyncExecuteNode : public EagerNode {
@@ -198,7 +201,7 @@ class AsyncExecuteNode : public EagerNode {
     }
     Status status = EagerKernelExecute(
         ctx_, inputs_, remote_func_params_, kernel_, graph_collector_,
-        cancellation_manager_, absl::MakeSpan(retvals_));
+        cancellation_manager_, absl::MakeSpan(retvals_), stack_trace_);
     if (!status.ok()) {
       if (stack_trace_.has_value()) {
         status = Status(status.code(), status.error_message(),
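AsyncExecuteNode already stored a stack_trace_ member and used it to annotate
the Status of a failed execution, as the tail of this hunk shows; the change
here additionally forwards that trace into EagerKernelExecute() so it is
visible inside the kernel's context during execution, and gives the
synchronous ExecuteNode a matching member. ExecuteNode takes the optional by
value rather than by const reference, which is cheap since ManagedStackTrace
is a lightweight handle rather than the captured frames themselves.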

@@ -243,7 +243,8 @@ Status KernelAndDeviceOp::Run(
     ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
     std::vector<EagerKernelRet>* outputs,
     CancellationManager* cancellation_manager,
-    const absl::optional<EagerRemoteFunctionParams>& remote_func_params) {
+    const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
+    const absl::optional<ManagedStackTrace>& stack_trace) {
   OpKernelContext::Params params;
   params.device = device_;
   params.frame_iter = FrameAndIter(0, 0);
@@ -255,6 +256,7 @@ Status KernelAndDeviceOp::Run(
   params.function_library = flr_;
   params.slice_reader_cache = &slice_reader_cache_;
   params.rendezvous = rendezvous_;
+  params.stack_trace = stack_trace;
 
   OpExecutionState* op_execution_state = nullptr;
   CancellationManager default_cancellation_manager;
@@ -320,7 +322,8 @@ Status KernelAndDeviceFunc::Run(
     ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
     std::vector<EagerKernelRet>* outputs,
     CancellationManager* cancellation_manager,
-    const absl::optional<EagerRemoteFunctionParams>& remote_func_params) {
+    const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
+    const absl::optional<ManagedStackTrace>& stack_trace) {
   Notification n;
   Status status;
   RunAsync(step_container, inputs, outputs, cancellation_manager,
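The `params.stack_trace = stack_trace;` assignment is the hand-off point into
OpKernelContext::Params for single-op execution. For functions,
KernelAndDeviceFunc::Run() accepts the new argument but delegates to
RunAsync(), whose signature is unchanged in this diff, so the trace does not
appear to reach function-body kernels as of this commit.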

@@ -40,6 +40,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/platform/fingerprint.h"
+#include "tensorflow/core/util/managed_stack_trace.h"
 #include "tensorflow/core/util/tensor_slice_reader_cache.h"
 #if !defined(IS_MOBILE_PLATFORM)
 #include "tensorflow/core/protobuf/remote_tensor_handle.pb.h"
@@ -131,7 +132,8 @@ class KernelAndDevice : public core::RefCounted {
       ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
       std::vector<EagerKernelRet>* outputs,
       CancellationManager* cancellation_manager,
-      const absl::optional<EagerRemoteFunctionParams>& remote_func_params) = 0;
+      const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
+      const absl::optional<ManagedStackTrace>& stack_trace) = 0;
 
   // Execute kernel asynchronously when applicable. Different from `Run` which
   // blocks the caller thread and waits for the execution of the op/function,
@@ -203,11 +205,12 @@ class KernelAndDeviceOp final : public KernelAndDevice {
   Status Init(const bool log_device_placement, const NodeDef& ndef,
               GraphCollector* graph_collector) override;
 
-  Status Run(ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
-             std::vector<EagerKernelRet>* outputs,
-             CancellationManager* cancellation_manager,
-             const absl::optional<EagerRemoteFunctionParams>&
-                 remote_func_params) override;
+  Status Run(
+      ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
+      std::vector<EagerKernelRet>* outputs,
+      CancellationManager* cancellation_manager,
+      const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
+      const absl::optional<ManagedStackTrace>& stack_trace) override;
 
   void RunAsync(
       ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
@@ -217,7 +220,7 @@ class KernelAndDeviceOp final : public KernelAndDevice {
                 StatusCallback done) override {
     // Trivial async implementation on top of the sync version
     done(Run(step_container, inputs, outputs, cancellation_manager,
-             remote_func_params));
+             remote_func_params, {}));
   }
 
   const OpKernel* kernel() const override { return kernel_.get(); }
@@ -291,11 +294,12 @@ class KernelAndDeviceFunc : public KernelAndDevice {
   Status Init(const bool log_device_placement, const NodeDef& ndef,
               GraphCollector* graph_collector) override;
 
-  Status Run(ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
-             std::vector<EagerKernelRet>* outputs,
-             CancellationManager* cancellation_manager,
-             const absl::optional<EagerRemoteFunctionParams>&
-                 remote_func_params) override;
+  Status Run(
+      ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
+      std::vector<EagerKernelRet>* outputs,
+      CancellationManager* cancellation_manager,
+      const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
+      const absl::optional<ManagedStackTrace>& stack_trace) override;
 
   void RunAsync(
       ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
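Since RunAsync() keeps its old signature, KernelAndDeviceOp's trivial async
wrapper above passes `{}` (an empty absl::optional<ManagedStackTrace>) to the
synchronous Run(), so the async path also drops the trace at this point.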

@@ -139,7 +139,8 @@ void BM_KernelAndDeviceRun(::testing::benchmark::State& state) {
   TF_CHECK_OK(k.Init({}, ndef, nullptr));
   const EagerKernelArgs args(std::move(inputs));
   for (auto s : state) {
-    TF_CHECK_OK(k.Run(nullptr, args, &outputs, nullptr, absl::nullopt));
+    TF_CHECK_OK(
+        k.Run(nullptr, args, &outputs, nullptr, absl::nullopt, absl::nullopt));
   }
 }
 BENCHMARK(BM_KernelAndDeviceRun);

@@ -980,7 +980,8 @@ TEST_F(FunctionWithRemoteInputsTest, KernelAndDeviceFuncTest) {
   TF_ASSERT_OK(kernel->Run(/*step_container=*/nullptr, inputs, &outputs,
                            /*cancellation_manager=*/nullptr,
-                           /*remote_func_params=*/absl::nullopt));
+                           /*remote_func_params=*/absl::nullopt,
+                           /*stack_trace=*/absl::nullopt));
   CheckOutputsAndClose(outputs, op_id);
 }

@@ -110,7 +110,8 @@ Status RemoteCopyNode::RunLocalSend(EagerOperation* op) {
   return kernel->Run(/*step_container=*/nullptr, args, /*outputs=*/nullptr,
                      /*cancellation_manager=*/nullptr,
-                     /*remote_func_params=*/absl::nullopt);
+                     /*remote_func_params=*/absl::nullopt,
+                     /*stack_trace=*/absl::nullopt);
 }
 
 void RemoteCopyNode::StartSend() {
@@ -195,7 +196,8 @@ Status RemoteCopyNode::RunLocalRecv(EagerOperation* op,
   std::vector<EagerKernelRet> rets;
   TF_RETURN_IF_ERROR(kernel->Run(/*step_container*/ nullptr, args, &rets,
                                  captured_state_->recv_cancellation(),
-                                 /*remote_func_params=*/absl::nullopt));
+                                 /*remote_func_params=*/absl::nullopt,
+                                 /*stack_trace=*/absl::nullopt));
   outputs->clear();
   for (const auto& ret : rets) {
     if (ret.index() == 0) {

@@ -54,6 +54,7 @@ limitations under the License.
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/core/util/managed_stack_trace.h"
 
 namespace Eigen {
 struct ThreadPoolDevice;
@@ -701,6 +702,8 @@ class OpKernelContext {
     std::function<void()> inc_num_deferred_ops_function;
     std::function<void()> dec_num_deferred_ops_function;
 
+    absl::optional<ManagedStackTrace> stack_trace = {};
+
     // For implementing `OpKernelContext::output_required()`. If null, all
     // outputs are required.
     bool* outputs_required_array = nullptr;
@@ -717,6 +720,11 @@ class OpKernelContext {
   const OpKernel& op_kernel() const { return *params_->op_kernel; }
 
+  // Stack trace of where the op was defined (if defined in eager mode).
+  const absl::optional<ManagedStackTrace>& stack_trace() const {
+    return params_->stack_trace;
+  }
+
   // Input/output signature.
 
   int num_inputs() const { return params_->inputs->size(); }
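With the accessor in place, a kernel can consult the trace. A hypothetical
kernel sketch follows (not part of this commit, which only makes the trace
available and changes no kernels; TraceAwareIdentityOp is an invented name,
and the snippet compiles only inside the TensorFlow tree):

    #include "tensorflow/core/framework/op_kernel.h"

    namespace tensorflow {

    class TraceAwareIdentityOp : public OpKernel {
     public:
      explicit TraceAwareIdentityOp(OpKernelConstruction* ctx)
          : OpKernel(ctx) {}

      void Compute(OpKernelContext* ctx) override {
        // New in this commit: the trace is present only when the op was
        // defined in eager mode and the runtime propagated it (see above).
        if (ctx->stack_trace().has_value()) {
          // An error path could use the trace to point at the Python line
          // where the op was created.
        }
        ctx->set_output(0, ctx->input(0));  // identity pass-through
      }
    };

    }  // namespace tensorflow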