Propagate eager mode stack traces into OpKernelCtx
PiperOrigin-RevId: 352853640
Change-Id: I66c70afb9b5004c609a9d98e097be8a48e9be533
This commit is contained in:
parent
3eac805944
commit
053d50118d
@ -1537,6 +1537,7 @@ tf_cuda_library(
|
||||
"@com_google_protobuf//:protobuf",
|
||||
],
|
||||
otherwise = [
|
||||
"//tensorflow/core/util:abstract_stack_trace",
|
||||
"@com_google_protobuf//:protobuf_headers",
|
||||
],
|
||||
),
|
||||
@ -1625,6 +1626,7 @@ tf_cuda_library(
|
||||
"//tensorflow/core/platform/default/build_config:platformlib",
|
||||
"//tensorflow/core/profiler/lib:annotated_traceme",
|
||||
"//tensorflow/core/profiler/lib:traceme",
|
||||
"//tensorflow/core/util:abstract_stack_trace",
|
||||
"//tensorflow/core/util:einsum_op_util",
|
||||
"//tensorflow/core/util:padding",
|
||||
"//tensorflow/core/util:port",
|
||||
|
@ -664,7 +664,8 @@ Status AddOrExecuteNode(core::RefCountPtr<KernelAndDevice> kernel,
|
||||
TF_RETURN_IF_ERROR(op->TensorHandleInputs(&inputs));
|
||||
ExecuteNode node(&ctx, *inputs, remote_func_params, kernel, graph_collector,
|
||||
op->GetCancellationManager(),
|
||||
{retvals, static_cast<size_t>(num_outputs)});
|
||||
{retvals, static_cast<size_t>(num_outputs)},
|
||||
op->GetStackTrace());
|
||||
Status s = executor.SyncExecute(&node);
|
||||
// We release the inputs AFTER executing the operation in sync mode since
|
||||
// ExecuteNode does not increment the reference count and thus does not have
|
||||
@ -1106,7 +1107,8 @@ Status EagerKernelExecute(
|
||||
const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
|
||||
const core::RefCountPtr<KernelAndDevice>& kernel,
|
||||
GraphCollector* graph_collector, CancellationManager* cancellation_manager,
|
||||
absl::Span<TensorHandle*> retvals) {
|
||||
absl::Span<TensorHandle*> retvals,
|
||||
const absl::optional<ManagedStackTrace>& stack_trace) {
|
||||
profiler::TraceMe activity("EagerKernelExecute",
|
||||
profiler::TraceMeLevel::kInfo);
|
||||
std::vector<EagerKernelRet> outputs(1);
|
||||
@ -1121,7 +1123,8 @@ Status EagerKernelExecute(
|
||||
// acquires a lock) and we can't recover from errors anyway.
|
||||
ScopedStepContainer* container = ctx->StepContainer();
|
||||
TF_RETURN_IF_ERROR(kernel->Run(container, inputs, &outputs,
|
||||
cancellation_manager, remote_func_params));
|
||||
cancellation_manager, remote_func_params,
|
||||
stack_trace));
|
||||
if (graph_collector != nullptr) {
|
||||
CollectGraphs(ctx);
|
||||
}
|
||||
|
@ -52,7 +52,8 @@ Status EagerKernelExecute(
|
||||
const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
|
||||
const core::RefCountPtr<KernelAndDevice>& kernel,
|
||||
GraphCollector* graph_collector, CancellationManager* cancellation_manager,
|
||||
absl::Span<TensorHandle*> retvals);
|
||||
absl::Span<TensorHandle*> retvals,
|
||||
const absl::optional<ManagedStackTrace>& stack_trace = {});
|
||||
|
||||
// Low-level utility to copy a tensor handle from one device to another. If
|
||||
// successful, result TensorHandle will be populated. If the caller requests for
|
||||
|
@ -97,7 +97,8 @@ class ExecuteNode : public EagerNode {
|
||||
const core::RefCountPtr<KernelAndDevice>& kernel,
|
||||
GraphCollector* graph_collector,
|
||||
CancellationManager* cancellation_manager,
|
||||
absl::Span<TensorHandle*> retvals)
|
||||
absl::Span<TensorHandle*> retvals,
|
||||
absl::optional<ManagedStackTrace> stack_trace)
|
||||
: EagerNode(),
|
||||
ctx_(ctx),
|
||||
inputs_(inputs),
|
||||
@ -105,7 +106,8 @@ class ExecuteNode : public EagerNode {
|
||||
kernel_(kernel),
|
||||
graph_collector_(graph_collector),
|
||||
cancellation_manager_(cancellation_manager),
|
||||
retvals_(retvals) {}
|
||||
retvals_(retvals),
|
||||
stack_trace_(stack_trace) {}
|
||||
|
||||
Status Run() override {
|
||||
int i = 0;
|
||||
@ -120,8 +122,8 @@ class ExecuteNode : public EagerNode {
|
||||
++i;
|
||||
}
|
||||
return EagerKernelExecute(ctx_, inputs_, remote_func_params_, kernel_,
|
||||
graph_collector_, cancellation_manager_,
|
||||
retvals_);
|
||||
graph_collector_, cancellation_manager_, retvals_,
|
||||
stack_trace_);
|
||||
}
|
||||
|
||||
void Abort(Status status) override {}
|
||||
@ -140,6 +142,7 @@ class ExecuteNode : public EagerNode {
|
||||
GraphCollector* graph_collector_;
|
||||
CancellationManager* const cancellation_manager_;
|
||||
absl::Span<TensorHandle*> retvals_;
|
||||
absl::optional<ManagedStackTrace> stack_trace_;
|
||||
};
|
||||
|
||||
class AsyncExecuteNode : public EagerNode {
|
||||
@ -198,7 +201,7 @@ class AsyncExecuteNode : public EagerNode {
|
||||
}
|
||||
Status status = EagerKernelExecute(
|
||||
ctx_, inputs_, remote_func_params_, kernel_, graph_collector_,
|
||||
cancellation_manager_, absl::MakeSpan(retvals_));
|
||||
cancellation_manager_, absl::MakeSpan(retvals_), stack_trace_);
|
||||
if (!status.ok()) {
|
||||
if (stack_trace_.has_value()) {
|
||||
status = Status(status.code(), status.error_message(),
|
||||
|
@ -243,7 +243,8 @@ Status KernelAndDeviceOp::Run(
|
||||
ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
|
||||
std::vector<EagerKernelRet>* outputs,
|
||||
CancellationManager* cancellation_manager,
|
||||
const absl::optional<EagerRemoteFunctionParams>& remote_func_params) {
|
||||
const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
|
||||
const absl::optional<ManagedStackTrace>& stack_trace) {
|
||||
OpKernelContext::Params params;
|
||||
params.device = device_;
|
||||
params.frame_iter = FrameAndIter(0, 0);
|
||||
@ -255,6 +256,7 @@ Status KernelAndDeviceOp::Run(
|
||||
params.function_library = flr_;
|
||||
params.slice_reader_cache = &slice_reader_cache_;
|
||||
params.rendezvous = rendezvous_;
|
||||
params.stack_trace = stack_trace;
|
||||
OpExecutionState* op_execution_state = nullptr;
|
||||
|
||||
CancellationManager default_cancellation_manager;
|
||||
@ -320,7 +322,8 @@ Status KernelAndDeviceFunc::Run(
|
||||
ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
|
||||
std::vector<EagerKernelRet>* outputs,
|
||||
CancellationManager* cancellation_manager,
|
||||
const absl::optional<EagerRemoteFunctionParams>& remote_func_params) {
|
||||
const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
|
||||
const absl::optional<ManagedStackTrace>& stack_trace) {
|
||||
Notification n;
|
||||
Status status;
|
||||
RunAsync(step_container, inputs, outputs, cancellation_manager,
|
||||
|
@ -40,6 +40,7 @@ limitations under the License.
|
||||
#include "tensorflow/core/lib/core/status.h"
|
||||
#include "tensorflow/core/lib/gtl/inlined_vector.h"
|
||||
#include "tensorflow/core/platform/fingerprint.h"
|
||||
#include "tensorflow/core/util/managed_stack_trace.h"
|
||||
#include "tensorflow/core/util/tensor_slice_reader_cache.h"
|
||||
#if !defined(IS_MOBILE_PLATFORM)
|
||||
#include "tensorflow/core/protobuf/remote_tensor_handle.pb.h"
|
||||
@ -131,7 +132,8 @@ class KernelAndDevice : public core::RefCounted {
|
||||
ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
|
||||
std::vector<EagerKernelRet>* outputs,
|
||||
CancellationManager* cancellation_manager,
|
||||
const absl::optional<EagerRemoteFunctionParams>& remote_func_params) = 0;
|
||||
const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
|
||||
const absl::optional<ManagedStackTrace>& stack_trace) = 0;
|
||||
|
||||
// Execute kernel asynchronously when applicable. Different from `Run` which
|
||||
// blocks the caller thread and waits for the execution of the op/function,
|
||||
@ -203,11 +205,12 @@ class KernelAndDeviceOp final : public KernelAndDevice {
|
||||
Status Init(const bool log_device_placement, const NodeDef& ndef,
|
||||
GraphCollector* graph_collector) override;
|
||||
|
||||
Status Run(ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
|
||||
std::vector<EagerKernelRet>* outputs,
|
||||
CancellationManager* cancellation_manager,
|
||||
const absl::optional<EagerRemoteFunctionParams>&
|
||||
remote_func_params) override;
|
||||
Status Run(
|
||||
ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
|
||||
std::vector<EagerKernelRet>* outputs,
|
||||
CancellationManager* cancellation_manager,
|
||||
const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
|
||||
const absl::optional<ManagedStackTrace>& stack_trace) override;
|
||||
|
||||
void RunAsync(
|
||||
ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
|
||||
@ -217,7 +220,7 @@ class KernelAndDeviceOp final : public KernelAndDevice {
|
||||
StatusCallback done) override {
|
||||
// Trivial async implementation on top of the sync version
|
||||
done(Run(step_container, inputs, outputs, cancellation_manager,
|
||||
remote_func_params));
|
||||
remote_func_params, {}));
|
||||
}
|
||||
|
||||
const OpKernel* kernel() const override { return kernel_.get(); }
|
||||
@ -291,11 +294,12 @@ class KernelAndDeviceFunc : public KernelAndDevice {
|
||||
Status Init(const bool log_device_placement, const NodeDef& ndef,
|
||||
GraphCollector* graph_collector) override;
|
||||
|
||||
Status Run(ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
|
||||
std::vector<EagerKernelRet>* outputs,
|
||||
CancellationManager* cancellation_manager,
|
||||
const absl::optional<EagerRemoteFunctionParams>&
|
||||
remote_func_params) override;
|
||||
Status Run(
|
||||
ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
|
||||
std::vector<EagerKernelRet>* outputs,
|
||||
CancellationManager* cancellation_manager,
|
||||
const absl::optional<EagerRemoteFunctionParams>& remote_func_params,
|
||||
const absl::optional<ManagedStackTrace>& stack_trace) override;
|
||||
|
||||
void RunAsync(
|
||||
ScopedStepContainer* step_container, const EagerKernelArgs& inputs,
|
||||
|
@ -139,7 +139,8 @@ void BM_KernelAndDeviceRun(::testing::benchmark::State& state) {
|
||||
TF_CHECK_OK(k.Init({}, ndef, nullptr));
|
||||
const EagerKernelArgs args(std::move(inputs));
|
||||
for (auto s : state) {
|
||||
TF_CHECK_OK(k.Run(nullptr, args, &outputs, nullptr, absl::nullopt));
|
||||
TF_CHECK_OK(
|
||||
k.Run(nullptr, args, &outputs, nullptr, absl::nullopt, absl::nullopt));
|
||||
}
|
||||
}
|
||||
BENCHMARK(BM_KernelAndDeviceRun);
|
||||
|
@ -980,7 +980,8 @@ TEST_F(FunctionWithRemoteInputsTest, KernelAndDeviceFuncTest) {
|
||||
|
||||
TF_ASSERT_OK(kernel->Run(/*step_container=*/nullptr, inputs, &outputs,
|
||||
/*cancellation_manager=*/nullptr,
|
||||
/*remote_func_params=*/absl::nullopt));
|
||||
/*remote_func_params=*/absl::nullopt,
|
||||
/*stack_trace=*/absl::nullopt));
|
||||
|
||||
CheckOutputsAndClose(outputs, op_id);
|
||||
}
|
||||
|
@ -110,7 +110,8 @@ Status RemoteCopyNode::RunLocalSend(EagerOperation* op) {
|
||||
|
||||
return kernel->Run(/*step_container=*/nullptr, args, /*outputs=*/nullptr,
|
||||
/*cancellation_manager=*/nullptr,
|
||||
/*remote_func_params=*/absl::nullopt);
|
||||
/*remote_func_params=*/absl::nullopt,
|
||||
/*stack_trace=*/absl::nullopt);
|
||||
}
|
||||
|
||||
void RemoteCopyNode::StartSend() {
|
||||
@ -195,7 +196,8 @@ Status RemoteCopyNode::RunLocalRecv(EagerOperation* op,
|
||||
std::vector<EagerKernelRet> rets;
|
||||
TF_RETURN_IF_ERROR(kernel->Run(/*step_container*/ nullptr, args, &rets,
|
||||
captured_state_->recv_cancellation(),
|
||||
/*remote_func_params=*/absl::nullopt));
|
||||
/*remote_func_params=*/absl::nullopt,
|
||||
/*stack_trace=*/absl::nullopt));
|
||||
outputs->clear();
|
||||
for (const auto& ret : rets) {
|
||||
if (ret.index() == 0) {
|
||||
|
@ -54,6 +54,7 @@ limitations under the License.
|
||||
#include "tensorflow/core/platform/thread_annotations.h"
|
||||
#include "tensorflow/core/platform/types.h"
|
||||
#include "tensorflow/core/protobuf/config.pb.h"
|
||||
#include "tensorflow/core/util/managed_stack_trace.h"
|
||||
|
||||
namespace Eigen {
|
||||
struct ThreadPoolDevice;
|
||||
@ -701,6 +702,8 @@ class OpKernelContext {
|
||||
std::function<void()> inc_num_deferred_ops_function;
|
||||
std::function<void()> dec_num_deferred_ops_function;
|
||||
|
||||
absl::optional<ManagedStackTrace> stack_trace = {};
|
||||
|
||||
// For implementing `OpKernelContext::output_required()`. If null, all
|
||||
// outputs are required.
|
||||
bool* outputs_required_array = nullptr;
|
||||
@ -717,6 +720,11 @@ class OpKernelContext {
|
||||
|
||||
const OpKernel& op_kernel() const { return *params_->op_kernel; }
|
||||
|
||||
// Stack trace of where the op was defined (if defined in eager mode).
|
||||
const absl::optional<ManagedStackTrace>& stack_trace() const {
|
||||
return params_->stack_trace;
|
||||
}
|
||||
|
||||
// Input/output signature.
|
||||
|
||||
int num_inputs() const { return params_->inputs->size(); }
|
||||
|
Loading…
Reference in New Issue
Block a user