Propagate eager mode stack traces into OpKernelCtx
PiperOrigin-RevId: 352853640
Change-Id: I66c70afb9b5004c609a9d98e097be8a48e9be533

commit 053d50118d (parent 3eac805944)
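Eager mode captures the Python stack trace at op creation time; this change plumbs that trace from `op->GetStackTrace()` through `ExecuteNode`/`AsyncExecuteNode` and `EagerKernelExecute` into `KernelAndDevice::Run`, which stores it in `OpKernelContext::Params::stack_trace`. Kernels can then read it through the new `OpKernelContext::stack_trace()` accessor. Call sites that have no trace to propagate (remote copies, the benchmark, and tests) pass `absl::nullopt`.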
@@ -1537,6 +1537,7 @@ tf_cuda_library(
             "@com_google_protobuf//:protobuf",
         ],
         otherwise = [
+            "//tensorflow/core/util:abstract_stack_trace",
             "@com_google_protobuf//:protobuf_headers",
         ],
     ),
@@ -1625,6 +1626,7 @@ tf_cuda_library(
         "//tensorflow/core/platform/default/build_config:platformlib",
         "//tensorflow/core/profiler/lib:annotated_traceme",
         "//tensorflow/core/profiler/lib:traceme",
+        "//tensorflow/core/util:abstract_stack_trace",
         "//tensorflow/core/util:einsum_op_util",
         "//tensorflow/core/util:padding",
         "//tensorflow/core/util:port",
@@ -664,7 +664,8 @@ Status AddOrExecuteNode(core::RefCountPtr<KernelAndDevice> kernel,
     TF_RETURN_IF_ERROR(op->TensorHandleInputs(&inputs));
     ExecuteNode node(&ctx, *inputs, remote_func_params, kernel, graph_collector,
                      op->GetCancellationManager(),
-                     {retvals, static_cast<size_t>(num_outputs)});
+                     {retvals, static_cast<size_t>(num_outputs)},
+                     op->GetStackTrace());
     Status s = executor.SyncExecute(&node);
     // We release the inputs AFTER executing the operation in sync mode since
     // ExecuteNode does not increment the reference count and thus does not have
@@ -1106,7 +1107,8 @@ Status EagerKernelExecute(
     const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
     const core::RefCountPtr<KernelAndDevice>& kernel,
     GraphCollector* graph_collector, CancellationManager* cancellation_manager,
-    absl::Span<TensorHandle*> retvals) {
+    absl::Span<TensorHandle*> retvals,
+    const absl::optional<ManagedStackTrace>& stack_trace) {
   profiler::TraceMe activity("EagerKernelExecute",
                              profiler::TraceMeLevel::kInfo);
   std::vector<EagerKernelRet> outputs(1);
@@ -1121,7 +1123,8 @@ Status EagerKernelExecute(
   // acquires a lock) and we can't recover from errors anyway.
   ScopedStepContainer* container = ctx->StepContainer();
   TF_RETURN_IF_ERROR(kernel->Run(container, inputs, &outputs,
-                                 cancellation_manager, remote_func_params));
+                                 cancellation_manager, remote_func_params,
+                                 stack_trace));
   if (graph_collector != nullptr) {
     CollectGraphs(ctx);
   }
@@ -52,7 +52,8 @@ Status EagerKernelExecute(
     const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
     const core::RefCountPtr<KernelAndDevice>& kernel,
     GraphCollector* graph_collector, CancellationManager* cancellation_manager,
-    absl::Span<TensorHandle*> retvals);
+    absl::Span<TensorHandle*> retvals,
+    const absl::optional<ManagedStackTrace>& stack_trace = {});
 
 // Low-level utility to copy a tensor handle from one device to another. If
 // successful, result TensorHandle will be populated. If the caller requests for
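Note: `stack_trace` is defaulted to an empty optional in the declaration above, so existing `EagerKernelExecute` callers that have no trace to report keep compiling unchanged; the eager execution paths pass the trace explicitly.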
@@ -97,7 +97,8 @@ class ExecuteNode : public EagerNode {
               const core::RefCountPtr<KernelAndDevice>& kernel,
               GraphCollector* graph_collector,
               CancellationManager* cancellation_manager,
-              absl::Span<TensorHandle*> retvals)
+              absl::Span<TensorHandle*> retvals,
+              absl::optional<ManagedStackTrace> stack_trace)
       : EagerNode(),
         ctx_(ctx),
         inputs_(inputs),
@@ -105,7 +106,8 @@ class ExecuteNode : public EagerNode {
         kernel_(kernel),
         graph_collector_(graph_collector),
         cancellation_manager_(cancellation_manager),
-        retvals_(retvals) {}
+        retvals_(retvals),
+        stack_trace_(stack_trace) {}
 
   Status Run() override {
     int i = 0;
@@ -120,8 +122,8 @@ class ExecuteNode : public EagerNode {
       ++i;
     }
     return EagerKernelExecute(ctx_, inputs_, remote_func_params_, kernel_,
-                              graph_collector_, cancellation_manager_,
-                              retvals_);
+                              graph_collector_, cancellation_manager_, retvals_,
+                              stack_trace_);
   }
 
   void Abort(Status status) override {}
@@ -140,6 +142,7 @@ class ExecuteNode : public EagerNode {
   GraphCollector* graph_collector_;
   CancellationManager* const cancellation_manager_;
   absl::Span<TensorHandle*> retvals_;
+  absl::optional<ManagedStackTrace> stack_trace_;
 };
 
 class AsyncExecuteNode : public EagerNode {
@@ -198,7 +201,7 @@ class AsyncExecuteNode : public EagerNode {
     }
     Status status = EagerKernelExecute(
         ctx_, inputs_, remote_func_params_, kernel_, graph_collector_,
-        cancellation_manager_, absl::MakeSpan(retvals_));
+        cancellation_manager_, absl::MakeSpan(retvals_), stack_trace_);
     if (!status.ok()) {
       if (stack_trace_.has_value()) {
         status = Status(status.code(), status.error_message(),
@@ -243,7 +243,8 @@ Status KernelAndDeviceOp::Run(
     ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
     std::vector<EagerKernelRet>* outputs,
     CancellationManager* cancellation_manager,
-    const absl::optional<EagerRemoteFunctionParams>& remote_func_params) {
+    const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
+    const absl::optional<ManagedStackTrace>& stack_trace) {
   OpKernelContext::Params params;
   params.device = device_;
   params.frame_iter = FrameAndIter(0, 0);
@@ -255,6 +256,7 @@ Status KernelAndDeviceOp::Run(
   params.function_library = flr_;
   params.slice_reader_cache = &slice_reader_cache_;
   params.rendezvous = rendezvous_;
+  params.stack_trace = stack_trace;
   OpExecutionState* op_execution_state = nullptr;
 
   CancellationManager default_cancellation_manager;
@@ -320,7 +322,8 @@ Status KernelAndDeviceFunc::Run(
     ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
     std::vector<EagerKernelRet>* outputs,
    CancellationManager* cancellation_manager,
-    const absl::optional<EagerRemoteFunctionParams>& remote_func_params) {
+    const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
+    const absl::optional<ManagedStackTrace>& stack_trace) {
   Notification n;
   Status status;
   RunAsync(step_container, inputs, outputs, cancellation_manager,
@@ -40,6 +40,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/platform/fingerprint.h"
+#include "tensorflow/core/util/managed_stack_trace.h"
 #include "tensorflow/core/util/tensor_slice_reader_cache.h"
 #if !defined(IS_MOBILE_PLATFORM)
 #include "tensorflow/core/protobuf/remote_tensor_handle.pb.h"
@@ -131,7 +132,8 @@ class KernelAndDevice : public core::RefCounted {
       ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
       std::vector<EagerKernelRet>* outputs,
       CancellationManager* cancellation_manager,
-      const absl::optional<EagerRemoteFunctionParams>& remote_func_params) = 0;
+      const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
+      const absl::optional<ManagedStackTrace>& stack_trace) = 0;
 
   // Execute kernel asynchronously when applicable. Different from `Run` which
   // blocks the caller thread and waits for the execution of the op/function,
@@ -203,11 +205,12 @@ class KernelAndDeviceOp final : public KernelAndDevice {
   Status Init(const bool log_device_placement, const NodeDef& ndef,
               GraphCollector* graph_collector) override;
 
-  Status Run(ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
-             std::vector<EagerKernelRet>* outputs,
-             CancellationManager* cancellation_manager,
-             const absl::optional<EagerRemoteFunctionParams>&
-                 remote_func_params) override;
+  Status Run(
+      ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
+      std::vector<EagerKernelRet>* outputs,
+      CancellationManager* cancellation_manager,
+      const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
+      const absl::optional<ManagedStackTrace>& stack_trace) override;
 
   void RunAsync(
       ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
@@ -217,7 +220,7 @@ class KernelAndDeviceOp final : public KernelAndDevice {
                 StatusCallback done) override {
     // Trivial async implementation on top of the sync version
     done(Run(step_container, inputs, outputs, cancellation_manager,
-             remote_func_params));
+             remote_func_params, {}));
   }
 
   const OpKernel* kernel() const override { return kernel_.get(); }
@@ -291,11 +294,12 @@ class KernelAndDeviceFunc : public KernelAndDevice {
   Status Init(const bool log_device_placement, const NodeDef& ndef,
               GraphCollector* graph_collector) override;
 
-  Status Run(ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
-             std::vector<EagerKernelRet>* outputs,
-             CancellationManager* cancellation_manager,
-             const absl::optional<EagerRemoteFunctionParams>&
-                 remote_func_params) override;
+  Status Run(
+      ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
+      std::vector<EagerKernelRet>* outputs,
+      CancellationManager* cancellation_manager,
+      const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
+      const absl::optional<ManagedStackTrace>& stack_trace) override;
 
   void RunAsync(
       ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
@@ -139,7 +139,8 @@ void BM_KernelAndDeviceRun(::testing::benchmark::State& state) {
   TF_CHECK_OK(k.Init({}, ndef, nullptr));
   const EagerKernelArgs args(std::move(inputs));
   for (auto s : state) {
-    TF_CHECK_OK(k.Run(nullptr, args, &outputs, nullptr, absl::nullopt));
+    TF_CHECK_OK(
+        k.Run(nullptr, args, &outputs, nullptr, absl::nullopt, absl::nullopt));
   }
 }
 BENCHMARK(BM_KernelAndDeviceRun);
@@ -980,7 +980,8 @@ TEST_F(FunctionWithRemoteInputsTest, KernelAndDeviceFuncTest) {
 
   TF_ASSERT_OK(kernel->Run(/*step_container=*/nullptr, inputs, &outputs,
                            /*cancellation_manager=*/nullptr,
-                           /*remote_func_params=*/absl::nullopt));
+                           /*remote_func_params=*/absl::nullopt,
+                           /*stack_trace=*/absl::nullopt));
 
   CheckOutputsAndClose(outputs, op_id);
 }
@@ -110,7 +110,8 @@ Status RemoteCopyNode::RunLocalSend(EagerOperation* op) {
 
   return kernel->Run(/*step_container=*/nullptr, args, /*outputs=*/nullptr,
                      /*cancellation_manager=*/nullptr,
-                     /*remote_func_params=*/absl::nullopt);
+                     /*remote_func_params=*/absl::nullopt,
+                     /*stack_trace=*/absl::nullopt);
 }
 
 void RemoteCopyNode::StartSend() {
@@ -195,7 +196,8 @@ Status RemoteCopyNode::RunLocalRecv(EagerOperation* op,
   std::vector<EagerKernelRet> rets;
   TF_RETURN_IF_ERROR(kernel->Run(/*step_container*/ nullptr, args, &rets,
                                  captured_state_->recv_cancellation(),
-                                 /*remote_func_params=*/absl::nullopt));
+                                 /*remote_func_params=*/absl::nullopt,
+                                 /*stack_trace=*/absl::nullopt));
   outputs->clear();
   for (const auto& ret : rets) {
     if (ret.index() == 0) {
@@ -54,6 +54,7 @@ limitations under the License.
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/core/util/managed_stack_trace.h"
 
 namespace Eigen {
 struct ThreadPoolDevice;
@@ -701,6 +702,8 @@ class OpKernelContext {
     std::function<void()> inc_num_deferred_ops_function;
     std::function<void()> dec_num_deferred_ops_function;
 
+    absl::optional<ManagedStackTrace> stack_trace = {};
+
     // For implementing `OpKernelContext::output_required()`. If null, all
     // outputs are required.
     bool* outputs_required_array = nullptr;
@@ -717,6 +720,11 @@ class OpKernelContext {
 
   const OpKernel& op_kernel() const { return *params_->op_kernel; }
 
+  // Stack trace of where the op was defined (if defined in eager mode).
+  const absl::optional<ManagedStackTrace>& stack_trace() const {
+    return params_->stack_trace;
+  }
+
   // Input/output signature.
 
   int num_inputs() const { return params_->inputs->size(); }
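As an illustration of the consumer side, here is a minimal sketch (not part of this commit) of a kernel reading the propagated trace through the new accessor; the `StackTraceAwareOp` class is hypothetical:

    #include "tensorflow/core/framework/op_kernel.h"
    #include "tensorflow/core/platform/logging.h"
    #include "tensorflow/core/util/managed_stack_trace.h"

    namespace tensorflow {

    // Hypothetical kernel that checks for an eager-mode creation trace.
    class StackTraceAwareOp : public OpKernel {
     public:
      explicit StackTraceAwareOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}

      void Compute(OpKernelContext* ctx) override {
        // stack_trace() is engaged only when the op was created in eager
        // mode and a trace was captured; graph-mode execution leaves it
        // empty, so consumers must handle both cases.
        if (ctx->stack_trace().has_value()) {
          VLOG(1) << "op " << name() << " has an eager creation stack trace";
        }
        // ... the kernel's actual computation would go here ...
      }
    };

    }  // namespace tensorflow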
|
Loading…
x
Reference in New Issue
Block a user