Add annotations for memory region type, tensor data type and shape.

PiperOrigin-RevId: 305585689
Change-Id: I6fec53e29afa0f91e99351cc50d3d9128241d173
A. Unique TensorFlower, 2020-04-08 17:11:37 -07:00, committed by TensorFlower Gardener
parent 287cacfb99
commit 27058058e3
15 changed files with 130 additions and 52 deletions
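The diffs below replace the old thread-local pending_op_name / pending_step_id pair with a ScopedMemoryDebugAnnotation that can also carry a memory region type, a tensor data type, and a tensor shape. A minimal, hedged sketch of the call pattern the changed call sites follow (op name, step id, shape, and the allocation itself are invented for illustration and are not part of this commit; include paths are assumed from the TF source tree):

// Hypothetical call site: annotate allocations made while producing an output
// tensor so allocators and the memory profiler can see region type, data type
// and shape. All concrete values here are made up.
#include "tensorflow/core/framework/allocator.h"     // ScopedMemoryDebugAnnotation (assumed path)
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.pb.h"      // DataType / DT_FLOAT

namespace tensorflow {

void AnnotatedAllocationExample(Allocator* allocator) {
  TensorShape shape({128, 256});
  ScopedMemoryDebugAnnotation op_annotation(
      "my_model/MatMul", /*step_id=*/42,  // made-up op name and step id
      "output", DT_FLOAT, &shape);        // new fields added by this commit
  // Any allocation performed while op_annotation is in scope can be tagged
  // with the fields above (see BFCAllocator::AddTraceMe below).
  void* buf =
      allocator->AllocateRaw(Allocator::kAllocatorAlignment, 128 * 256 * 4);
  allocator->DeallocateRaw(buf);
}

}  // namespace tensorflow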

View File

@@ -429,6 +429,7 @@ cc_library(
visibility = ["//visibility:public"],
deps = [
":shared_counter",
+"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//tensorflow/core:protos_all_cc",

View File

@@ -142,8 +142,8 @@ class CollectiveAdapterImpl : public CollectiveAdapter {
Tensor TempChunk(int i) const override {
AllocationAttributes empty;
-auto op_annotation =
-ScopedMemoryDebugAnnotation("CollectiveAdapterImpl::TempChunk", 0);
+ScopedMemoryDebugAnnotation op_annotation(
+"CollectiveAdapterImpl::TempChunk");
return Tensor(allocator_, dt_, {ChunkElts(i)}, empty);
}

View File

@@ -29,6 +29,7 @@ limitations under the License.
#ifdef TENSORFLOW_MEM_DEBUG
#include "tensorflow/core/platform/stacktrace.h"
#endif
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/lib/traceme.h"
#include "tensorflow/core/protobuf/bfc_memory_map.pb.h"
@@ -427,11 +428,13 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
// Dump the memory log for analysis.
MaybeWriteMemoryMap();
if (dump_log_on_failure) {
LOG(WARNING) << "Allocator (" << Name() << ") ran out of memory trying "
<< "to allocate " << strings::HumanReadableNumBytes(num_bytes)
<< " (rounded to " << rounded_bytes << ")"
<< "requested by op " << pending_op_name
<< "\nCurrent allocation summary follows.";
LOG(WARNING)
<< "Allocator (" << Name() << ") ran out of memory trying "
<< "to allocate " << strings::HumanReadableNumBytes(num_bytes)
<< " (rounded to " << rounded_bytes << ")"
<< "requested by op "
<< ScopedMemoryDebugAnnotation::CurrentAnnotation().pending_op_name
<< "\nCurrent allocation summary follows.";
DumpMemoryLog(rounded_bytes);
LOG(WARNING) << RenderOccupancy();
}
@@ -453,6 +456,11 @@ void BFCAllocator::AddTraceMe(absl::string_view traceme_name,
memory_limit_ - stats.bytes_reserved - stats.bytes_in_use;
BFCAllocator::Chunk* chunk =
ChunkFromHandle(region_manager_.get_handle(chunk_ptr));
+const auto& annotation =
+ScopedMemoryDebugAnnotation::CurrentAnnotation();
+std::string tensor_shape = annotation.pending_shape
+? annotation.pending_shape->DebugString()
+: "";
return absl::StrCat(traceme_name, "#allocator_name=", name_,
",bytes_reserved=", stats.bytes_reserved,
@@ -462,8 +470,11 @@ void BFCAllocator::AddTraceMe(absl::string_view traceme_name,
",requested_bytes=", chunk->requested_size,
",allocation_bytes=", chunk->size,
",addr=", reinterpret_cast<uint64>(chunk_ptr),
",tf_op=", pending_op_name, ",id=", pending_step_id,
"#");
",tf_op=", annotation.pending_op_name,
",id=", annotation.pending_step_id,
",region_type=", annotation.pending_region_type,
",data_type=", annotation.pending_data_type,
",shape=", tensor_shape, "#");
},
traceme_level);
}
@@ -516,17 +527,20 @@ void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
#ifdef TENSORFLOW_MEM_DEBUG
if (ShouldRecordOpName()) {
-if (pending_op_name != nullptr) {
-chunk->op_name = pending_op_name;
+const auto& annotation =
+ScopedMemoryDebugAnnotation::CurrentAnnotation();
+if (annotation.pending_op_name != nullptr) {
+chunk->op_name = annotation.pending_op_name;
} else {
LOG(INFO) << "missing pending_op_name for " << Name()
<< " reading addr "
-<< static_cast<const void*>(&pending_op_name) << "\n"
+<< static_cast<const void*>(&annotation.pending_op_name)
+<< "\n"
<< CurrentStackTrace();
chunk->op_name = nullptr;
}
chunk->action_count = ++action_counter_;
-chunk->step_id = pending_step_id;
+chunk->step_id = annotation.pending_step_id;
int slot = chunk->action_count % MEM_DEBUG_SIZE_HISTORY_SIZE;
size_history_[slot] = stats_.bytes_in_use;
}

View File

@@ -50,8 +50,8 @@ class CopyToDeviceNode : public EagerNode {
Status Run() override {
tensorflow::Tensor tensor;
-auto op_annotation = ScopedMemoryDebugAnnotation(
-pending_op_name ? pending_op_name : "eager::CopyToDeviceNode");
+ScopedMemoryDebugAnnotation op_annotation(
+"eager::CopyToDeviceNode", "dynamic", tensor.dtype(), &tensor.shape());
TF_RETURN_IF_ERROR(src_->CopyToDevice(ctx_, dstd_, &tensor));
if (!async_ && mirror_) {
return dst_->AddLocalMirror(std::move(tensor), dstd_);

View File

@@ -374,7 +374,10 @@ Status MustCompileWithXLA(const EagerOperation* op, const EagerContext& ctx,
// running without an explicitly requested device.
Status EagerLocalExecute(EagerOperation* op, TensorHandle** retvals,
int* num_retvals) {
-auto op_annotation = ScopedMemoryDebugAnnotation(op->op_name());
+ScopedMemoryDebugAnnotation op_annotation(
+op->op_name(), op->remote_func_params().has_value()
+? op->remote_func_params().value().step_id.value_or(0)
+: 0);
profiler::TraceMe activity(
[&] { return absl::StrCat("EagerLocalExecute: ", op->Name()); },
profiler::TraceMeLevel::kInfo);

View File

@@ -340,7 +340,7 @@ Status BaseGPUDevice::InitScratchBuffers() {
if (!scratch_) {
DCHECK(stream_);
size_t scratch_buffer_size = Eigen::kGpuScratchSize + sizeof(unsigned int);
auto op_annotation = ScopedMemoryDebugAnnotation("ScratchBuffer");
ScopedMemoryDebugAnnotation op_annotation("ScratchBuffer");
void* scratch_buffer = gpu_allocator_->AllocateRaw(
Allocator::kAllocatorAlignment, scratch_buffer_size);
if (scratch_buffer == nullptr) {
@@ -498,8 +498,8 @@ void BaseGPUDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
}
}
ScopedActivateExecutorContext scoped_activation{stream->parent()};
-auto op_annotation = ScopedMemoryDebugAnnotation(
-op_kernel->name_view().data(), context->step_id());
+ScopedMemoryDebugAnnotation op_annotation(op_kernel->name_view().data(),
+context->step_id());
op_kernel->Compute(context);
if (context->status().ok()) {
if (sync_every_op_) {
@@ -612,8 +612,6 @@ Status BaseGPUDevice::MaybeCopyTensorToGPU(
Status BaseGPUDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
const AllocatorAttributes alloc_attrs,
Tensor* tensor) {
-auto op_annotation = ScopedMemoryDebugAnnotation(
-(pending_op_name != nullptr ? pending_op_name : "MakeTensorFromProto"));
AllocatorAttributes attr;
attr.set_on_host(true);
attr.set_gpu_compatible(true);
@@ -624,6 +622,8 @@ Status BaseGPUDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
tensor_proto.DebugString());
}
ScopedMemoryDebugAnnotation op_annotation("MakeTensorFromProto", "dynamic",
parsed.dtype(), &parsed.shape());
if (parsed.dtype() == DT_VARIANT) {
const Variant* from = parsed.flat<Variant>().data();
int numa_node = attributes().locality().numa_node();

View File

@@ -409,8 +409,9 @@ void HierarchicalTreeBroadcaster::DispatchSend(int subdiv, int dst_rank,
int src_rank,
const Tensor* src_tensor,
const StatusCallback& done) {
-auto op_annotation = ScopedMemoryDebugAnnotation(
-col_ctx_->op_ctx->op_kernel().name_view().data());
+ScopedMemoryDebugAnnotation op_annotation(
+col_ctx_->op_ctx->op_kernel().name_view().data(), col_ctx_->step_id,
+"dynamic", src_tensor->dtype(), &src_tensor->shape());
string send_buf_key =
BroadcastBufKey(col_ctx_->exec_key, subdiv, src_rank, dst_rank);
int dst_idx =

View File

@@ -74,7 +74,8 @@ void SameWorkerRecvDone(const DeviceMgr* device_mgr,
return;
}
auto op_annotation = ScopedMemoryDebugAnnotation("SameWorkerRecvDone");
ScopedMemoryDebugAnnotation op_annotation("SameWorkerRecvDone", 0, "dynamic",
in.dtype(), &in.shape());
AllocatorAttributes attr = recv_args.alloc_attrs;
attr.set_gpu_compatible(send_args.alloc_attrs.gpu_compatible() ||
recv_args.alloc_attrs.gpu_compatible());
@@ -112,7 +113,7 @@ void IntraProcessRecvAsyncImpl(const DeviceMgr* device_mgr,
RendezvousInterface::DoneCallback done) {
VLOG(1) << "IntraProcessRendezvous Recv " << local << " " << parsed.FullKey();
auto op_annotation = ScopedMemoryDebugAnnotation("RecvAsync");
ScopedMemoryDebugAnnotation op_annotation("RecvAsync");
// Recv the tensor from local_.
local->RecvAsync(
parsed, recv_args,

View File

@@ -272,9 +272,8 @@ void BaseRemoteRendezvous::SameWorkerRecvDone(
return;
}
-// Note that it would be nice to cache the step_id here, but it's not
-// available.
-auto op_annotation = ScopedMemoryDebugAnnotation("SameWorkerRecvDone", 0);
+ScopedMemoryDebugAnnotation op_annotation("SameWorkerRecvDone", step_id_,
+"dynamic", in.dtype(), &in.shape());
AllocatorAttributes attr = recv_args.alloc_attrs;
attr.set_gpu_compatible(send_args.alloc_attrs.gpu_compatible() ||
recv_args.alloc_attrs.gpu_compatible());
@@ -323,7 +322,7 @@ void BaseRemoteRendezvous::RecvAsync(const ParsedKey& parsed,
DCHECK(is_initialized()) << "RecvAsync called when uninitialized (key: "
<< parsed.FullKey() << ").";
auto op_annotation = ScopedMemoryDebugAnnotation("RecvAsync", 0);
ScopedMemoryDebugAnnotation op_annotation("RecvAsync", step_id_);
// Are src and dst in the same worker?
if (IsSameWorker(parsed.src, parsed.dst)) {
// Recv the tensor from local_.

View File

@@ -129,9 +129,10 @@ void CollectiveRemoteAccessDistributed::RecvFromPeer(
}
AllocatorAttributes cpu_attr;
cpu_attr.set_gpu_compatible(true);
-auto op_annotation = ScopedMemoryDebugAnnotation(
+ScopedMemoryDebugAnnotation op_annotation(
"CollectiveRemoteAccessDistributed::RecvFromPeer"
-"::recv_buf_callback");
+"::recv_buf_callback",
+step_id_, "dynamic", to_tensor->dtype(), &to_tensor->shape());
Tensor* cpu_tensor = new Tensor(cpu_dev->GetAllocator(cpu_attr),
to_tensor->dtype(), to_tensor->shape());
PopulateTensorFromExtra(extra, cpu_tensor);

View File

@@ -669,8 +669,9 @@ void GrpcWorker::RecvBufAsync(CallOptions* opts, const RecvBufRequest* request,
AllocatorAttributes cpu_attr;
cpu_attr.set_gpu_compatible(true);
cpu_attr.set_nic_compatible(true);
-auto op_annotation = ScopedMemoryDebugAnnotation(
-"GrpcWorker::RecvBufAsync::consumer_callback");
+ScopedMemoryDebugAnnotation op_annotation(
+"GrpcWorker::RecvBufAsync::consumer_callback", request->step_id(),
+"dynamic", hook->prod_value->dtype(), &hook->prod_value->shape());
Tensor* cpu_tensor =
new Tensor(cpu_dev->GetAllocator(cpu_attr),
hook->prod_value->dtype(), hook->prod_value->shape());

View File

@@ -27,8 +27,7 @@ limitations under the License.
namespace tensorflow {
-thread_local const char* pending_op_name = nullptr;
-thread_local int64 pending_step_id = 0;
+thread_local MemoryDebugAnnotation ScopedMemoryDebugAnnotation::annotation_;
string AllocatorStats::DebugString() const {
return strings::Printf(

View File

@@ -32,6 +32,8 @@ limitations under the License.
namespace tensorflow {
+class TensorShape;
// Attributes for a single allocation call. Different calls to the same
// allocator could potentially have different allocation attributes.
struct AllocationAttributes {
@@ -62,31 +64,80 @@ struct AllocationAttributes {
TF_DISALLOW_COPY_AND_ASSIGN(AllocationAttributes);
};
-// The runtime will cache Op names in thread-local memory and some allocators
-// will try to tag allocations with the requesting Op.
-extern thread_local const char* pending_op_name;
-extern thread_local int64 pending_step_id;
+// Annotations for memory profiling and debugging purpose. The runtime will
+// cache the annotations in thread-local memory, and some allocators will try to
+// tag allocations with the annotations.
+struct MemoryDebugAnnotation {
+const char* pending_op_name = nullptr;
+int64 pending_step_id = 0;
+const char* pending_region_type = nullptr;
+int32 pending_data_type = 0;
+const TensorShape* pending_shape = nullptr;
+};
-// Wrapper class of pending_op_name and pending_step_id for RAII.
+// Wrapper class of MemoryDebugAnnotation for RAII.
class ScopedMemoryDebugAnnotation {
public:
+static const MemoryDebugAnnotation& CurrentAnnotation() {
+return annotation_;
+}
explicit ScopedMemoryDebugAnnotation(const char* op_name) {
-last_op_name_ = pending_op_name;
-pending_op_name = op_name;
+last_annotation_ = annotation_;
+CleanupAnnotation();
+annotation_.pending_op_name = op_name;
}
explicit ScopedMemoryDebugAnnotation(const char* op_name, int64 step_id) {
-last_op_name_ = pending_op_name;
-pending_op_name = op_name;
-pending_step_id = step_id;
+last_annotation_ = annotation_;
+CleanupAnnotation();
+annotation_.pending_op_name = op_name;
+annotation_.pending_step_id = step_id;
}
-~ScopedMemoryDebugAnnotation() { pending_op_name = last_op_name_; }
+// This constructor keeps the pending_op_name and pending_step_id from parent
+// (if any). Otherwise it overwrites with op_name.
+explicit ScopedMemoryDebugAnnotation(const char* op_name,
+const char* region_type, int32 data_type,
+const TensorShape* shape) {
+last_annotation_ = annotation_;
+if (!annotation_.pending_op_name) {
+annotation_.pending_op_name = op_name;
+}
+annotation_.pending_region_type = region_type;
+annotation_.pending_data_type = data_type;
+annotation_.pending_shape = shape;
+}
+explicit ScopedMemoryDebugAnnotation(const char* op_name, int64 step_id,
+const char* region_type, int32 data_type,
+const TensorShape* shape) {
+last_annotation_ = annotation_;
+annotation_.pending_op_name = op_name;
+annotation_.pending_step_id = step_id;
+annotation_.pending_region_type = region_type;
+annotation_.pending_data_type = data_type;
+annotation_.pending_shape = shape;
+}
+~ScopedMemoryDebugAnnotation() { annotation_ = last_annotation_; }
private:
-// Stores the previous value of pending_op_name in case the annotations are
-// nested.
-const char* last_op_name_ = nullptr;
+void CleanupAnnotation() {
+annotation_.pending_op_name = nullptr;
+annotation_.pending_step_id = 0;
+annotation_.pending_region_type = nullptr;
+annotation_.pending_data_type = 0;
+annotation_.pending_shape = nullptr;
+}
+// Stores the current annotations.
+static thread_local MemoryDebugAnnotation annotation_;
+// Stores the previous values in case the annotations are nested.
+MemoryDebugAnnotation last_annotation_;
TF_DISALLOW_COPY_AND_ASSIGN(ScopedMemoryDebugAnnotation);
};
// Runtime statistics collected by an allocator. Exactly the same as
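
To make the nesting semantics of the constructors above concrete, here is a small sketch (names and values invented; not part of the commit, and it assumes the usual framework includes such as tensor_shape.h and types.pb.h). The inner four-argument form adds region type, data type, and shape while keeping the parent's pending_op_name and pending_step_id; each destructor restores the previous annotation:

namespace tensorflow {

void NestedAnnotationSketch() {
  TensorShape shape({128, 256});
  ScopedMemoryDebugAnnotation outer("my_model/MatMul", /*step_id=*/42);
  {
    // op_name is ignored here because the parent already set one; region
    // type, data type, and shape are overwritten for this scope.
    ScopedMemoryDebugAnnotation inner("unused_name", "output", DT_FLOAT,
                                      &shape);
    const MemoryDebugAnnotation& a =
        ScopedMemoryDebugAnnotation::CurrentAnnotation();
    // Here: a.pending_op_name == "my_model/MatMul", a.pending_step_id == 42,
    // a.pending_region_type == "output", a.pending_shape == &shape.
    // An allocator invoked in this scope reads these fields through
    // CurrentAnnotation() to tag the allocation.
  }
  // Leaving each scope restores the previously active annotation (RAII).
}

}  // namespace tensorflow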

View File

@@ -703,8 +703,6 @@ Status OpKernelContext::allocate_tensor(
DataType type, const TensorShape& shape, Tensor* out_tensor,
AllocatorAttributes attr, const AllocationAttributes& allocation_attr) {
Allocator* a = get_allocator(attr);
-auto op_annotation =
-ScopedMemoryDebugAnnotation(op_kernel().name_view().data(), step_id());
Tensor new_tensor(a, type, shape,
AllocationAttributes(allocation_attr.no_retry_on_failure,
/* allocation_will_be_logged= */ true,
@@ -758,6 +756,8 @@ Status OpKernelContext::allocate_output(int index, const TensorShape& shape,
" more than once. Try turning off the ScopedAllocator optimizer.");
}
}
+ScopedMemoryDebugAnnotation op_annotation(op_kernel().name_view().data(),
+step_id(), "output", type, &shape);
auto output_tensor = MakeUnique<Tensor>();
Status s = allocate_tensor(type, shape, output_tensor.get(), attr);
if (s.ok()) {
@@ -787,6 +787,8 @@ Status OpKernelContext::allocate_temp(
<< ". Switch to allocate_output to avoid performance penalty.";
allocator_attr.scope_id = -1;
}
+ScopedMemoryDebugAnnotation op_annotation(op_kernel().name_view().data(),
+step_id(), "temp", type, &shape);
Status s =
allocate_tensor(type, shape, out_temp, allocator_attr, allocation_attr);
if (track_allocations() && s.ok() && out_temp->TotalBytes() > 0) {
@@ -815,6 +817,8 @@ Status OpKernelContext::allocate_persistent(DataType type,
return errors::Internal(
"Unexpected call to allocate_persistent with scope_id ", attr.scope_id);
}
+ScopedMemoryDebugAnnotation op_annotation(op_kernel().name_view().data(),
+step_id(), "persist", type, &shape);
Tensor persistent;
Status s = allocate_tensor(type, shape, &persistent, attr);
if (s.ok()) {
@@ -921,6 +925,9 @@ bool OpKernelContext::maybe_set_output_by_allocate_and_copy(
<< " params_->forward_from_array[index] "
<< params_->forward_from_array[index] << " alloc_attr.scope_id "
<< output_alloc_attr(index).scope_id;
+ScopedMemoryDebugAnnotation op_annotation(op_kernel().name_view().data(),
+step_id(), "output",
+tensor.dtype(), &tensor.shape());
auto new_tensor = MakeUnique<Tensor>();
Status s = allocate_tensor(tensor.dtype(), tensor.shape(), new_tensor.get(),
output_alloc_attr(index));

View File

@@ -73,7 +73,7 @@ ConstantOp::ConstantOp(OpKernelConstruction* ctx)
: OpKernel(ctx, StripTensorDataFromNodeDef(ctx), false),
tensor_(ctx->output_type(0)) {
const TensorProto* proto = nullptr;
-auto op_annotation = ScopedMemoryDebugAnnotation(name_view().data());
+ScopedMemoryDebugAnnotation op_annotation(name_view().data());
OP_REQUIRES_OK(ctx, ctx->GetAttr("value", &proto));
OP_REQUIRES_OK(ctx, ctx->device()->MakeTensorFromProto(
*proto, AllocatorAttributes(), &tensor_));