Propagate the frame where the resource var was created to resource var handles

Allows error messages to indicate where the resource var was created.

PiperOrigin-RevId: 353729467
Change-Id: I90c19bfecc2d2df19523b28a0caeea2abe331fae
Authored by George Karpenkov on 2021-01-25 14:20:31 -08:00; committed by TensorFlower Gardener
parent ba088e0369
commit 1b0d50b051
14 changed files with 129 additions and 32 deletions
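
In miniature, the change works like this: the handle for a resource variable now carries the stack trace captured where the variable was defined, and the wrong-device error path appends that location to its message. The standalone C++ sketch below only illustrates the idea; Handle, MAKE_HANDLE, and WrongDeviceError are made-up names, not TensorFlow APIs. In the actual patch the trace comes from ctx->stack_trace() in VarHandleOp and is formatted via ManagedStackTrace::ToStackFrames in GetVariableInfosFromInputs.

// Standalone illustration (not TensorFlow code): a handle that remembers its
// creation site so that error messages can point back to it.
#include <iostream>
#include <optional>
#include <string>

struct CreationSite {
  std::string file_name;
  int line_number;
};

struct Handle {
  std::string name;
  std::string device;
  std::optional<CreationSite> definition_site;  // empty if never captured
};

// Captures the caller's file/line, playing the role of ctx->stack_trace().
#define MAKE_HANDLE(name, device) \
  Handle { (name), (device), CreationSite{__FILE__, __LINE__} }

// Mirrors the error construction in GetVariableInfosFromInputs: the location
// suffix stays empty when no definition site was recorded.
std::string WrongDeviceError(const Handle& handle,
                             const std::string& accessing_device) {
  std::string definition_location;
  if (handle.definition_site) {
    definition_location =
        " (defined @ " + handle.definition_site->file_name + ":" +
        std::to_string(handle.definition_site->line_number) + ")";
  }
  return "Trying to access resource " + handle.name + definition_location +
         " located in device " + handle.device + " from device " +
         accessing_device;
}

int main() {
  Handle v = MAKE_HANDLE("my_variable", "/device:CPU:0");
  // Prints: Trying to access resource my_variable (defined @ <file>:<line>)
  // located in device /device:CPU:0 from device /device:GPU:0
  std::cout << WrongDeviceError(v, "/device:GPU:0") << "\n";
}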

@@ -89,9 +89,25 @@ Status GetVariableInfosFromInputs(ResourceMgr* rm, DeviceBase* dev,
Var* variable = nullptr;
ResourceHandle handle = inputs[var_idx]->flat<ResourceHandle>()(0);
if (handle.device() != dev->attributes().name()) {
-return errors::InvalidArgument(
-"Trying to access resource ", handle.name(), " located in device ",
-handle.device(), " from device ", dev->attributes().name());
+std::string definition_location = [&]() -> std::string {
+if (handle.definition_stack_trace()) {
+std::vector<StackFrame> stack_frames =
+handle.definition_stack_trace()->ToStackFrames(
+{}, IsInternalFrameForFilename,
+/*reverse_traversal=*/true,
+/*limit=*/1);
+if (!stack_frames.empty()) {
+const StackFrame& last_frame = stack_frames[0];
+return absl::StrCat(" (defined @ ", last_frame.file_name, ":",
+last_frame.line_number, ")");
+}
+}
+return "";
+}();
+return errors::InvalidArgument("Trying to access resource ",
+handle.name(), definition_location,
+" located in device ", handle.device(),
+" from device ", dev->attributes().name());
}
TF_RETURN_IF_ERROR(rm->LookupOrCreate<Var>(
handle.container(), handle.name(), &variable, [](Var** ptr) {

@@ -706,6 +706,7 @@ cc_library(
"//tensorflow/core/lib/strings:strcat",
"//tensorflow/core/platform:tensor_coding",
"//tensorflow/core/platform:types",
+"//tensorflow/core/util:abstract_stack_trace",
],
alwayslink = 1,
)
@@ -801,6 +802,7 @@ tf_cuda_library(
"//tensorflow/core/platform:tensor_coding",
"//tensorflow/core/platform:types",
"//tensorflow/core/public:version",
+"//tensorflow/core/util:abstract_stack_trace",
"//third_party/eigen3",
"@com_google_absl//absl/memory",
"@com_google_absl//absl/strings",

@@ -20,6 +20,7 @@ limitations under the License.
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/platform/tensor_coding.h"
#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/managed_stack_trace.h"
namespace tensorflow {
@@ -71,6 +72,15 @@ class ResourceHandle {
dtypes_and_shapes_ = dtypes_and_shapes;
}
+void set_definition_stack_trace(
+const absl::optional<ManagedStackTrace>& definition_stack_trace) {
+definition_stack_trace_ = definition_stack_trace;
+}
+const absl::optional<ManagedStackTrace>& definition_stack_trace() const {
+return definition_stack_trace_;
+}
// Conversion to and from ResourceHandleProto
void AsProto(ResourceHandleProto* proto) const;
void FromProto(const ResourceHandleProto& proto);
@@ -93,6 +103,7 @@ class ResourceHandle {
uint64 hash_code_ = 0;
std::string maybe_type_name_;
std::vector<DtypeAndPartialTensorShape> dtypes_and_shapes_;
+absl::optional<ManagedStackTrace> definition_stack_trace_;
};
// For backwards compatibility for when this was a proto

@@ -36,10 +36,12 @@ static std::atomic<int64> current_id_;
ResourceHandle MakeResourceHandle(
const string& container, const string& name, const DeviceBase& device,
const TypeIndex& type_index,
-const std::vector<DtypeAndPartialTensorShape>& dtypes_and_shapes) {
+const std::vector<DtypeAndPartialTensorShape>& dtypes_and_shapes,
+const absl::optional<ManagedStackTrace>& definition_stack_trace) {
ResourceHandle result;
result.set_device(device.name());
result.set_container(container);
+result.set_definition_stack_trace(definition_stack_trace);
if (name == ResourceHandle::ANONYMOUS_NAME) {
result.set_name(strings::StrCat("_AnonymousVar", current_id_.fetch_add(1)));
} else {

@@ -296,28 +296,33 @@ class ResourceMgr {
ResourceHandle MakeResourceHandle(
const std::string& container, const std::string& name,
const DeviceBase& device, const TypeIndex& type_index,
-const std::vector<DtypeAndPartialTensorShape>& dtypes_and_shapes = {})
+const std::vector<DtypeAndPartialTensorShape>& dtypes_and_shapes = {},
+const absl::optional<ManagedStackTrace>& definition_stack_trace = {})
TF_MUST_USE_RESULT;
template <typename T>
ResourceHandle MakeResourceHandle(
OpKernelContext* ctx, const std::string& container, const std::string& name,
-const std::vector<DtypeAndPartialTensorShape>& dtypes_and_shapes = {}) {
-return MakeResourceHandle(
-container.empty() ? ctx->resource_manager()->default_container()
-: container,
-name, *ctx->device(), TypeIndex::Make<T>(), dtypes_and_shapes);
+const std::vector<DtypeAndPartialTensorShape>& dtypes_and_shapes = {},
+const absl::optional<ManagedStackTrace>& definition_stack_trace = {}) {
+return MakeResourceHandle(container.empty()
+? ctx->resource_manager()->default_container()
+: container,
+name, *ctx->device(), TypeIndex::Make<T>(),
+dtypes_and_shapes, definition_stack_trace);
}
template <typename T>
ResourceHandle MakeResourceHandle(
OpKernelConstruction* ctx, const std::string& container,
const std::string& name,
-const std::vector<DtypeAndPartialTensorShape>& dtypes_and_shapes = {}) {
-return MakeResourceHandle(
-container.empty() ? ctx->resource_manager()->default_container()
-: container,
-name, *ctx->device(), TypeIndex::Make<T>(), dtypes_and_shapes);
+const std::vector<DtypeAndPartialTensorShape>& dtypes_and_shapes = {},
+const absl::optional<ManagedStackTrace>& definition_stack_trace = {}) {
+return MakeResourceHandle(container.empty()
+? ctx->resource_manager()->default_container()
+: container,
+name, *ctx->device(), TypeIndex::Make<T>(),
+dtypes_and_shapes, definition_stack_trace);
}
Status MakeResourceHandleToOutput(OpKernelContext* context, int output_index,

@@ -245,7 +245,8 @@ void VarHandleOp::Compute(OpKernelContext* ctx) {
ctx, ctx->allocate_temp(DT_RESOURCE, TensorShape({}), &handle, attr));
handle.scalar<ResourceHandle>()() = MakeResourceHandle<Var>(
ctx, container_, name_,
-std::vector<DtypeAndPartialTensorShape>{dtype_and_shape_});
+std::vector<DtypeAndPartialTensorShape>{dtype_and_shape_},
+ctx->stack_trace());
ctx->set_output(0, handle);
} else {
ctx->set_output(0, resource_);
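
Because the new definition_stack_trace parameter is defaulted to an empty absl::optional, only call sites that want the richer error message need touching; every other caller of MakeResourceHandle keeps compiling unchanged. Below is a sketch of the two calling styles against the extended signature (the surrounding kernel code is elided and the names are illustrative):

// Sketch only: contrasting callers of the extended MakeResourceHandle.
#include <vector>

#include "tensorflow/core/framework/resource_mgr.h"
#include "tensorflow/core/framework/resource_var.h"

namespace tensorflow {

void ExampleCallers(OpKernelContext* ctx, const string& container,
                    const string& name,
                    const DtypeAndPartialTensorShape& dtype_and_shape) {
  // Updated caller (as in VarHandleOp above): the op's captured stack trace
  // travels with the handle, so later errors can say where it was defined.
  ResourceHandle with_trace = MakeResourceHandle<Var>(
      ctx, container, name,
      std::vector<DtypeAndPartialTensorShape>{dtype_and_shape},
      ctx->stack_trace());

  // Pre-existing caller: unchanged. The optional trace defaults to empty and
  // the error message simply omits the "(defined @ ...)" suffix.
  ResourceHandle without_trace = MakeResourceHandle<Var>(
      ctx, container, name,
      std::vector<DtypeAndPartialTensorShape>{dtype_and_shape});

  (void)with_trace;
  (void)without_trace;
}

}  // namespace tensorflow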

@@ -633,6 +633,11 @@ cc_library(
alwayslink = 1,
)
+cc_library(
+name = "stack_frame",
+hdrs = ["stack_frame.h"],
+)
cc_library(
name = "status",
srcs = ["status.cc"],
@@ -641,6 +646,7 @@ cc_library(
":logging",
":macros",
":mutex",
+":stack_frame",
":stacktrace",
":str_util",
":strcat",
@@ -1385,6 +1391,7 @@ filegroup(
"ram_file_system.h",
"random.h",
"resource.h",
+"stack_frame.h",
"stacktrace.h",
"stacktrace_handler.h",
"status.h",
@@ -1633,6 +1640,7 @@ filegroup(
"snappy.h",
"stacktrace.h",
"status.cc",
+"stack_frame.h",
"status.h",
"str_util.cc",
"str_util.h",

@@ -0,0 +1,39 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_PLATFORM_STACK_FRAME_H_
+#define TENSORFLOW_CORE_PLATFORM_STACK_FRAME_H_
+#include <string>
+namespace tensorflow {
+// A struct representing a frame in a stack trace.
+struct StackFrame {
+std::string file_name;
+int line_number;
+std::string function_name;
+bool operator==(const StackFrame& other) const {
+return line_number == other.line_number &&
+function_name == other.function_name && file_name == other.file_name;
+}
+bool operator!=(const StackFrame& other) const { return !(*this == other); }
+};
+} // namespace tensorflow
+#endif // TENSORFLOW_CORE_PLATFORM_STACK_FRAME_H_
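The StackFrame struct now lives in its own platform header, presumably so that code that only needs the struct (the stack-trace utilities and, through them, ResourceHandle) no longer has to pull in status.h just for it; the include and BUILD hunks around this file reflect that. As a small illustration of the struct on its own, here is a hypothetical helper (FormatFrames is not part of the patch) that renders frames in a Python-traceback-like style:

// Sketch only: formatting StackFrames without any dependency on Status.
#include <string>
#include <vector>

#include "absl/strings/str_cat.h"
#include "tensorflow/core/platform/stack_frame.h"

namespace tensorflow {

std::string FormatFrames(const std::vector<StackFrame>& frames) {
  std::string out;
  for (const StackFrame& frame : frames) {
    absl::StrAppend(&out, "  File \"", frame.file_name, "\", line ",
                    frame.line_number, ", in ", frame.function_name, "\n");
  }
  return out;
}

}  // namespace tensorflow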

@@ -24,26 +24,13 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/stack_frame.h"
#include "tensorflow/core/platform/stringpiece.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/protobuf/error_codes.pb.h"
namespace tensorflow {
-// A struct representing a frame in a stack trace.
-struct StackFrame {
-std::string file_name;
-int line_number;
-std::string function_name;
-bool operator==(const StackFrame& other) const {
-return line_number == other.line_number &&
-function_name == other.function_name && file_name == other.file_name;
-}
-bool operator!=(const StackFrame& other) const { return !(*this == other); }
-};
#if defined(__clang__)
// Only clang supports warn_unused_result as a type annotation.
class TF_MUST_USE_RESULT Status;

@@ -474,14 +474,16 @@ cc_library(
"//tensorflow/c/eager:__pkg__",
"//tensorflow/core:__pkg__",
"//tensorflow/core/common_runtime/eager:__pkg__",
+"//tensorflow/core/framework:__pkg__",
"//tensorflow/core/platform:__pkg__",
"//tensorflow/python:__pkg__",
"//tensorflow/python/eager:__pkg__",
"//tensorflow/python/util:__pkg__",
],
deps = [
"//tensorflow/core/platform:status",
+"//tensorflow/core/platform:stack_frame",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/types:optional",
],
)

@@ -17,9 +17,11 @@ limitations under the License.
#define TENSORFLOW_CORE_UTIL_ABSTRACT_STACK_TRACE_H_
#include <string>
#include <vector>
#include "absl/strings/match.h"
-#include "tensorflow/core/platform/status.h"
+#include "absl/types/optional.h"
+#include "tensorflow/core/platform/stack_frame.h"
namespace tensorflow {

@@ -944,6 +944,26 @@ class DefFunctionTest(xla_test.XLATestCase):
    self.assertEqual(cell_nojit.value(), orig_nojit + 2)
    self.assertEqual(cell_jit.value(), orig_jit + 3)
+  @test_util.disable_mlir_bridge('TODO(b/162272821): MLIR bridge returns '
+                                 ' wrong status type')
+  def testResourceWrongDevice(self):
+    if 'gpu' not in self.device.lower():
+      self.skipTest('Need a GPU to have non-trivial device placement')
+    with ops.device('device:CPU:0'):
+      v = variables.Variable([3.1, 3.2])
+    with ops.device('device:{}:0'.format(self.device)):
+      @def_function.function(experimental_compile=True)
+      def update_var(a):
+        v.assign_add(a)
+      arg = random_ops.random_normal([2])
+      with self.assertRaisesRegex(errors.InvalidArgumentError,
+                                  'def_function_xla_jit_test.py'):
+        update_var(arg)
if __name__ == '__main__':
  ops.enable_eager_execution()

@@ -341,6 +341,7 @@ cc_library(
srcs = ["stack_trace.cc"],
hdrs = ["stack_trace.h"],
deps = [
+"//tensorflow/core/platform:status",
"//tensorflow/core/platform:str_util",
"//tensorflow/core/platform:stringpiece",
"//tensorflow/core/util:abstract_stack_trace",

@@ -30,6 +30,7 @@ limitations under the License.
#include "absl/container/flat_hash_set.h"
#include "absl/container/inlined_vector.h"
#include "absl/types/optional.h"
+#include "tensorflow/core/platform/status.h"
#include "tensorflow/core/util/managed_stack_trace.h"
namespace tensorflow {