Add annotations for memory region type, tensor data type and shape.
PiperOrigin-RevId: 305585689
Change-Id: I6fec53e29afa0f91e99351cc50d3d9128241d173
parent 287cacfb99
commit 27058058e3
Changed files under tensorflow/core:
  common_runtime/
    BUILD
    base_collective_executor.cc
    bfc_allocator.cc
    eager/
    gpu/
    hierarchical_tree_broadcaster.cc
    rendezvous_mgr.cc
  distributed_runtime/
  framework/
  kernels/
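A minimal, hypothetical sketch of the usage pattern this change enables (the kernel name, step id, and shape below are invented; the five-argument constructor comes from the allocator.h hunk further down):

```cpp
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.pb.h"

void AnnotatedAllocationExample(tensorflow::Allocator* allocator) {
  tensorflow::TensorShape shape({128, 256});  // hypothetical shape
  // Tag subsequent allocations on this thread with op name, step id,
  // memory region type, tensor data type, and shape.
  tensorflow::ScopedMemoryDebugAnnotation op_annotation(
      "MyKernel", /*step_id=*/42, "output", tensorflow::DT_FLOAT, &shape);
  tensorflow::Tensor t(allocator, tensorflow::DT_FLOAT, shape);
  // While `op_annotation` is alive, BFCAllocator's TraceMe/debug paths can
  // read ScopedMemoryDebugAnnotation::CurrentAnnotation() to attach these
  // fields to the allocation.
}
```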
@@ -429,6 +429,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":shared_counter",
+        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
@@ -142,8 +142,8 @@ class CollectiveAdapterImpl : public CollectiveAdapter {

   Tensor TempChunk(int i) const override {
     AllocationAttributes empty;
-    auto op_annotation =
-        ScopedMemoryDebugAnnotation("CollectiveAdapterImpl::TempChunk", 0);
+    ScopedMemoryDebugAnnotation op_annotation(
+        "CollectiveAdapterImpl::TempChunk");
     return Tensor(allocator_, dt_, {ChunkElts(i)}, empty);
   }

@@ -29,6 +29,7 @@ limitations under the License.
 #ifdef TENSORFLOW_MEM_DEBUG
 #include "tensorflow/core/platform/stacktrace.h"
 #endif
+#include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/profiler/lib/traceme.h"
 #include "tensorflow/core/protobuf/bfc_memory_map.pb.h"
@@ -427,11 +428,13 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
     // Dump the memory log for analysis.
     MaybeWriteMemoryMap();
     if (dump_log_on_failure) {
-      LOG(WARNING) << "Allocator (" << Name() << ") ran out of memory trying "
-                   << "to allocate " << strings::HumanReadableNumBytes(num_bytes)
-                   << " (rounded to " << rounded_bytes << ")"
-                   << "requested by op " << pending_op_name
-                   << "\nCurrent allocation summary follows.";
+      LOG(WARNING)
+          << "Allocator (" << Name() << ") ran out of memory trying "
+          << "to allocate " << strings::HumanReadableNumBytes(num_bytes)
+          << " (rounded to " << rounded_bytes << ")"
+          << "requested by op "
+          << ScopedMemoryDebugAnnotation::CurrentAnnotation().pending_op_name
+          << "\nCurrent allocation summary follows.";
       DumpMemoryLog(rounded_bytes);
       LOG(WARNING) << RenderOccupancy();
     }
@@ -453,6 +456,11 @@ void BFCAllocator::AddTraceMe(absl::string_view traceme_name,
             memory_limit_ - stats.bytes_reserved - stats.bytes_in_use;
         BFCAllocator::Chunk* chunk =
             ChunkFromHandle(region_manager_.get_handle(chunk_ptr));
+        const auto& annotation =
+            ScopedMemoryDebugAnnotation::CurrentAnnotation();
+        std::string tensor_shape = annotation.pending_shape
+                                       ? annotation.pending_shape->DebugString()
+                                       : "";

         return absl::StrCat(traceme_name, "#allocator_name=", name_,
                             ",bytes_reserved=", stats.bytes_reserved,
@@ -462,8 +470,11 @@ void BFCAllocator::AddTraceMe(absl::string_view traceme_name,
                             ",requested_bytes=", chunk->requested_size,
                             ",allocation_bytes=", chunk->size,
                             ",addr=", reinterpret_cast<uint64>(chunk_ptr),
-                            ",tf_op=", pending_op_name, ",id=", pending_step_id,
-                            "#");
+                            ",tf_op=", annotation.pending_op_name,
+                            ",id=", annotation.pending_step_id,
+                            ",region_type=", annotation.pending_region_type,
+                            ",data_type=", annotation.pending_data_type,
+                            ",shape=", tensor_shape, "#");
       },
       traceme_level);
 }
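For illustration only, the TraceMe metadata assembled above would take roughly this shape after the change (field values here are invented, and the other stats fields are elided):

    <traceme_name>#allocator_name=GPU_0_bfc,...,tf_op=MatMul,id=42,region_type=output,data_type=1,shape=[128,256]#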
@@ -516,17 +527,20 @@ void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,

 #ifdef TENSORFLOW_MEM_DEBUG
       if (ShouldRecordOpName()) {
-        if (pending_op_name != nullptr) {
-          chunk->op_name = pending_op_name;
+        const auto& annotation =
+            ScopedMemoryDebugAnnotation::CurrentAnnotation();
+        if (annotation.pending_op_name != nullptr) {
+          chunk->op_name = annotation.pending_op_name;
         } else {
           LOG(INFO) << "missing pending_op_name for " << Name()
                     << " reading addr "
-                    << static_cast<const void*>(&pending_op_name) << "\n"
+                    << static_cast<const void*>(&annotation.pending_op_name)
+                    << "\n"
                     << CurrentStackTrace();
           chunk->op_name = nullptr;
         }
         chunk->action_count = ++action_counter_;
-        chunk->step_id = pending_step_id;
+        chunk->step_id = annotation.pending_step_id;
         int slot = chunk->action_count % MEM_DEBUG_SIZE_HISTORY_SIZE;
         size_history_[slot] = stats_.bytes_in_use;
       }
@@ -50,8 +50,8 @@ class CopyToDeviceNode : public EagerNode {

   Status Run() override {
     tensorflow::Tensor tensor;
-    auto op_annotation = ScopedMemoryDebugAnnotation(
-        pending_op_name ? pending_op_name : "eager::CopyToDeviceNode");
+    ScopedMemoryDebugAnnotation op_annotation(
+        "eager::CopyToDeviceNode", "dynamic", tensor.dtype(), &tensor.shape());
     TF_RETURN_IF_ERROR(src_->CopyToDevice(ctx_, dstd_, &tensor));
     if (!async_ && mirror_) {
       return dst_->AddLocalMirror(std::move(tensor), dstd_);
@@ -374,7 +374,10 @@ Status MustCompileWithXLA(const EagerOperation* op, const EagerContext& ctx,
 // running without an explicitly requested device.
 Status EagerLocalExecute(EagerOperation* op, TensorHandle** retvals,
                          int* num_retvals) {
-  auto op_annotation = ScopedMemoryDebugAnnotation(op->op_name());
+  ScopedMemoryDebugAnnotation op_annotation(
+      op->op_name(), op->remote_func_params().has_value()
+                         ? op->remote_func_params().value().step_id.value_or(0)
+                         : 0);
   profiler::TraceMe activity(
       [&] { return absl::StrCat("EagerLocalExecute: ", op->Name()); },
       profiler::TraceMeLevel::kInfo);
@@ -340,7 +340,7 @@ Status BaseGPUDevice::InitScratchBuffers() {
   if (!scratch_) {
     DCHECK(stream_);
     size_t scratch_buffer_size = Eigen::kGpuScratchSize + sizeof(unsigned int);
-    auto op_annotation = ScopedMemoryDebugAnnotation("ScratchBuffer");
+    ScopedMemoryDebugAnnotation op_annotation("ScratchBuffer");
     void* scratch_buffer = gpu_allocator_->AllocateRaw(
         Allocator::kAllocatorAlignment, scratch_buffer_size);
     if (scratch_buffer == nullptr) {
@@ -498,8 +498,8 @@ void BaseGPUDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
       }
     }
     ScopedActivateExecutorContext scoped_activation{stream->parent()};
-    auto op_annotation = ScopedMemoryDebugAnnotation(
-        op_kernel->name_view().data(), context->step_id());
+    ScopedMemoryDebugAnnotation op_annotation(op_kernel->name_view().data(),
+                                              context->step_id());
     op_kernel->Compute(context);
     if (context->status().ok()) {
       if (sync_every_op_) {
@@ -612,8 +612,6 @@ Status BaseGPUDevice::MaybeCopyTensorToGPU(
 Status BaseGPUDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
                                           const AllocatorAttributes alloc_attrs,
                                           Tensor* tensor) {
-  auto op_annotation = ScopedMemoryDebugAnnotation(
-      (pending_op_name != nullptr ? pending_op_name : "MakeTensorFromProto"));
   AllocatorAttributes attr;
   attr.set_on_host(true);
   attr.set_gpu_compatible(true);
@@ -624,6 +622,8 @@ Status BaseGPUDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
                                    tensor_proto.DebugString());
   }

+  ScopedMemoryDebugAnnotation op_annotation("MakeTensorFromProto", "dynamic",
+                                            parsed.dtype(), &parsed.shape());
   if (parsed.dtype() == DT_VARIANT) {
     const Variant* from = parsed.flat<Variant>().data();
     int numa_node = attributes().locality().numa_node();
@@ -409,8 +409,9 @@ void HierarchicalTreeBroadcaster::DispatchSend(int subdiv, int dst_rank,
                                                int src_rank,
                                                const Tensor* src_tensor,
                                                const StatusCallback& done) {
-  auto op_annotation = ScopedMemoryDebugAnnotation(
-      col_ctx_->op_ctx->op_kernel().name_view().data());
+  ScopedMemoryDebugAnnotation op_annotation(
+      col_ctx_->op_ctx->op_kernel().name_view().data(), col_ctx_->step_id,
+      "dynamic", src_tensor->dtype(), &src_tensor->shape());
   string send_buf_key =
       BroadcastBufKey(col_ctx_->exec_key, subdiv, src_rank, dst_rank);
   int dst_idx =
@@ -74,7 +74,8 @@ void SameWorkerRecvDone(const DeviceMgr* device_mgr,
     return;
   }

-  auto op_annotation = ScopedMemoryDebugAnnotation("SameWorkerRecvDone");
+  ScopedMemoryDebugAnnotation op_annotation("SameWorkerRecvDone", 0, "dynamic",
+                                            in.dtype(), &in.shape());
   AllocatorAttributes attr = recv_args.alloc_attrs;
   attr.set_gpu_compatible(send_args.alloc_attrs.gpu_compatible() ||
                           recv_args.alloc_attrs.gpu_compatible());
@@ -112,7 +113,7 @@ void IntraProcessRecvAsyncImpl(const DeviceMgr* device_mgr,
                                RendezvousInterface::DoneCallback done) {
   VLOG(1) << "IntraProcessRendezvous Recv " << local << " " << parsed.FullKey();

-  auto op_annotation = ScopedMemoryDebugAnnotation("RecvAsync");
+  ScopedMemoryDebugAnnotation op_annotation("RecvAsync");
   // Recv the tensor from local_.
   local->RecvAsync(
       parsed, recv_args,
@@ -272,9 +272,8 @@ void BaseRemoteRendezvous::SameWorkerRecvDone(
     return;
   }

-  // Note that it would be nice to cache the step_id here, but it's not
-  // available.
-  auto op_annotation = ScopedMemoryDebugAnnotation("SameWorkerRecvDone", 0);
+  ScopedMemoryDebugAnnotation op_annotation("SameWorkerRecvDone", step_id_,
+                                            "dynamic", in.dtype(), &in.shape());
   AllocatorAttributes attr = recv_args.alloc_attrs;
   attr.set_gpu_compatible(send_args.alloc_attrs.gpu_compatible() ||
                           recv_args.alloc_attrs.gpu_compatible());
@@ -323,7 +322,7 @@ void BaseRemoteRendezvous::RecvAsync(const ParsedKey& parsed,
   DCHECK(is_initialized()) << "RecvAsync called when uninitialized (key: "
                            << parsed.FullKey() << ").";

-  auto op_annotation = ScopedMemoryDebugAnnotation("RecvAsync", 0);
+  ScopedMemoryDebugAnnotation op_annotation("RecvAsync", step_id_);
   // Are src and dst in the same worker?
   if (IsSameWorker(parsed.src, parsed.dst)) {
     // Recv the tensor from local_.
@@ -129,9 +129,10 @@ void CollectiveRemoteAccessDistributed::RecvFromPeer(
   }
   AllocatorAttributes cpu_attr;
   cpu_attr.set_gpu_compatible(true);
-  auto op_annotation = ScopedMemoryDebugAnnotation(
-      "CollectiveRemoteAccessDistributed::RecvFromPeer"
-      "::recv_buf_callback");
+  ScopedMemoryDebugAnnotation op_annotation(
+      "CollectiveRemoteAccessDistributed::RecvFromPeer"
+      "::recv_buf_callback",
+      step_id_, "dynamic", to_tensor->dtype(), &to_tensor->shape());
   Tensor* cpu_tensor = new Tensor(cpu_dev->GetAllocator(cpu_attr),
                                   to_tensor->dtype(), to_tensor->shape());
   PopulateTensorFromExtra(extra, cpu_tensor);
@@ -669,8 +669,9 @@ void GrpcWorker::RecvBufAsync(CallOptions* opts, const RecvBufRequest* request,
       AllocatorAttributes cpu_attr;
       cpu_attr.set_gpu_compatible(true);
       cpu_attr.set_nic_compatible(true);
-      auto op_annotation = ScopedMemoryDebugAnnotation(
-          "GrpcWorker::RecvBufAsync::consumer_callback");
+      ScopedMemoryDebugAnnotation op_annotation(
+          "GrpcWorker::RecvBufAsync::consumer_callback", request->step_id(),
+          "dynamic", hook->prod_value->dtype(), &hook->prod_value->shape());
       Tensor* cpu_tensor =
           new Tensor(cpu_dev->GetAllocator(cpu_attr),
                      hook->prod_value->dtype(), hook->prod_value->shape());
@@ -27,8 +27,7 @@ limitations under the License.

 namespace tensorflow {

-thread_local const char* pending_op_name = nullptr;
-thread_local int64 pending_step_id = 0;
+thread_local MemoryDebugAnnotation ScopedMemoryDebugAnnotation::annotation_;

 string AllocatorStats::DebugString() const {
   return strings::Printf(
@@ -32,6 +32,8 @@ limitations under the License.

 namespace tensorflow {

+class TensorShape;
+
 // Attributes for a single allocation call. Different calls to the same
 // allocator could potentially have different allocation attributes.
 struct AllocationAttributes {
@@ -62,31 +64,80 @@ struct AllocationAttributes {
   TF_DISALLOW_COPY_AND_ASSIGN(AllocationAttributes);
 };

-// The runtime will cache Op names in thread-local memory and some allocators
-// will try to tag allocations with the requesting Op.
-extern thread_local const char* pending_op_name;
-extern thread_local int64 pending_step_id;
+// Annotations for memory profiling and debugging purpose. The runtime will
+// cache the annotations in thread-local memory, and some allocators will try to
+// tag allocations with the annotations.
+struct MemoryDebugAnnotation {
+  const char* pending_op_name = nullptr;
+  int64 pending_step_id = 0;
+  const char* pending_region_type = nullptr;
+  int32 pending_data_type = 0;
+  const TensorShape* pending_shape = nullptr;
+};

-// Wrapper class of pending_op_name and pending_step_id for RAII.
+// Wrapper class of MemoryDebugAnnotation for RAII.
 class ScopedMemoryDebugAnnotation {
  public:
+  static const MemoryDebugAnnotation& CurrentAnnotation() {
+    return annotation_;
+  }
+
   explicit ScopedMemoryDebugAnnotation(const char* op_name) {
-    last_op_name_ = pending_op_name;
-    pending_op_name = op_name;
+    last_annotation_ = annotation_;
+    CleanupAnnotation();
+    annotation_.pending_op_name = op_name;
   }

   explicit ScopedMemoryDebugAnnotation(const char* op_name, int64 step_id) {
-    last_op_name_ = pending_op_name;
-    pending_op_name = op_name;
-    pending_step_id = step_id;
+    last_annotation_ = annotation_;
+    CleanupAnnotation();
+    annotation_.pending_op_name = op_name;
+    annotation_.pending_step_id = step_id;
   }

-  ~ScopedMemoryDebugAnnotation() { pending_op_name = last_op_name_; }
+  // This constructor keeps the pending_op_name and pending_step_id from parent
+  // (if any). Otherwise it overwrites with op_name.
+  explicit ScopedMemoryDebugAnnotation(const char* op_name,
+                                       const char* region_type, int32 data_type,
+                                       const TensorShape* shape) {
+    last_annotation_ = annotation_;
+    if (!annotation_.pending_op_name) {
+      annotation_.pending_op_name = op_name;
+    }
+    annotation_.pending_region_type = region_type;
+    annotation_.pending_data_type = data_type;
+    annotation_.pending_shape = shape;
+  }
+
+  explicit ScopedMemoryDebugAnnotation(const char* op_name, int64 step_id,
+                                       const char* region_type, int32 data_type,
+                                       const TensorShape* shape) {
+    last_annotation_ = annotation_;
+    annotation_.pending_op_name = op_name;
+    annotation_.pending_step_id = step_id;
+    annotation_.pending_region_type = region_type;
+    annotation_.pending_data_type = data_type;
+    annotation_.pending_shape = shape;
+  }
+
+  ~ScopedMemoryDebugAnnotation() { annotation_ = last_annotation_; }

  private:
-  // Stores the previous value of pending_op_name in case the annotations are
-  // nested.
-  const char* last_op_name_ = nullptr;
+  void CleanupAnnotation() {
+    annotation_.pending_op_name = nullptr;
+    annotation_.pending_step_id = 0;
+    annotation_.pending_region_type = nullptr;
+    annotation_.pending_data_type = 0;
+    annotation_.pending_shape = nullptr;
+  }
+
+  // Stores the current annotations.
+  static thread_local MemoryDebugAnnotation annotation_;
+
+  // Stores the previous values in case the annotations are nested.
+  MemoryDebugAnnotation last_annotation_;

   TF_DISALLOW_COPY_AND_ASSIGN(ScopedMemoryDebugAnnotation);
 };

 // Runtime statistics collected by an allocator. Exactly the same as
@@ -703,8 +703,6 @@ Status OpKernelContext::allocate_tensor(
     DataType type, const TensorShape& shape, Tensor* out_tensor,
     AllocatorAttributes attr, const AllocationAttributes& allocation_attr) {
   Allocator* a = get_allocator(attr);
-  auto op_annotation =
-      ScopedMemoryDebugAnnotation(op_kernel().name_view().data(), step_id());
   Tensor new_tensor(a, type, shape,
                     AllocationAttributes(allocation_attr.no_retry_on_failure,
                                          /* allocation_will_be_logged= */ true,
|
||||
" more than once. Try turning off the ScopedAllocator optimizer.");
|
||||
}
|
||||
}
|
||||
ScopedMemoryDebugAnnotation op_annotation(op_kernel().name_view().data(),
|
||||
step_id(), "output", type, &shape);
|
||||
auto output_tensor = MakeUnique<Tensor>();
|
||||
Status s = allocate_tensor(type, shape, output_tensor.get(), attr);
|
||||
if (s.ok()) {
|
||||
@ -787,6 +787,8 @@ Status OpKernelContext::allocate_temp(
|
||||
<< ". Switch to allocate_output to avoid performance penalty.";
|
||||
allocator_attr.scope_id = -1;
|
||||
}
|
||||
ScopedMemoryDebugAnnotation op_annotation(op_kernel().name_view().data(),
|
||||
step_id(), "temp", type, &shape);
|
||||
Status s =
|
||||
allocate_tensor(type, shape, out_temp, allocator_attr, allocation_attr);
|
||||
if (track_allocations() && s.ok() && out_temp->TotalBytes() > 0) {
|
||||
@ -815,6 +817,8 @@ Status OpKernelContext::allocate_persistent(DataType type,
|
||||
return errors::Internal(
|
||||
"Unexpected call to allocate_persistent with scope_id ", attr.scope_id);
|
||||
}
|
||||
ScopedMemoryDebugAnnotation op_annotation(op_kernel().name_view().data(),
|
||||
step_id(), "persist", type, &shape);
|
||||
Tensor persistent;
|
||||
Status s = allocate_tensor(type, shape, &persistent, attr);
|
||||
if (s.ok()) {
|
||||
@ -921,6 +925,9 @@ bool OpKernelContext::maybe_set_output_by_allocate_and_copy(
|
||||
<< " params_->forward_from_array[index] "
|
||||
<< params_->forward_from_array[index] << " alloc_attr.scope_id "
|
||||
<< output_alloc_attr(index).scope_id;
|
||||
ScopedMemoryDebugAnnotation op_annotation(op_kernel().name_view().data(),
|
||||
step_id(), "output",
|
||||
tensor.dtype(), &tensor.shape());
|
||||
auto new_tensor = MakeUnique<Tensor>();
|
||||
Status s = allocate_tensor(tensor.dtype(), tensor.shape(), new_tensor.get(),
|
||||
output_alloc_attr(index));
|
||||
|
@@ -73,7 +73,7 @@ ConstantOp::ConstantOp(OpKernelConstruction* ctx)
     : OpKernel(ctx, StripTensorDataFromNodeDef(ctx), false),
       tensor_(ctx->output_type(0)) {
   const TensorProto* proto = nullptr;
-  auto op_annotation = ScopedMemoryDebugAnnotation(name_view().data());
+  ScopedMemoryDebugAnnotation op_annotation(name_view().data());
   OP_REQUIRES_OK(ctx, ctx->GetAttr("value", &proto));
   OP_REQUIRES_OK(ctx, ctx->device()->MakeTensorFromProto(
                           *proto, AllocatorAttributes(), &tensor_));