Add annotations for memory region type, tensor data type and shape.
PiperOrigin-RevId: 305585689
Change-Id: I6fec53e29afa0f91e99351cc50d3d9128241d173
parent 287cacfb99
commit 27058058e3
Changed files under tensorflow/core:
  common_runtime/
    BUILD
    base_collective_executor.cc
    bfc_allocator.cc
    eager/
    gpu/
    hierarchical_tree_broadcaster.cc
    rendezvous_mgr.cc
  distributed_runtime/
  framework/
  kernels/
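A minimal, hypothetical sketch of the usage pattern this change enables (the kernel name, step id, and shape below are invented; the five-argument constructor comes from the allocator.h hunk further down):

```cpp
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.pb.h"

void AnnotatedAllocationExample(tensorflow::Allocator* allocator) {
  tensorflow::TensorShape shape({128, 256});  // hypothetical shape
  // Tag subsequent allocations on this thread with op name, step id,
  // memory region type, tensor data type, and shape.
  tensorflow::ScopedMemoryDebugAnnotation op_annotation(
      "MyKernel", /*step_id=*/42, "output", tensorflow::DT_FLOAT, &shape);
  tensorflow::Tensor t(allocator, tensorflow::DT_FLOAT, shape);
  // While `op_annotation` is alive, BFCAllocator's TraceMe/debug paths can
  // read ScopedMemoryDebugAnnotation::CurrentAnnotation() to attach these
  // fields to the allocation.
}
```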
@@ -429,6 +429,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":shared_counter",
+        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
@@ -142,8 +142,8 @@ class CollectiveAdapterImpl : public CollectiveAdapter {

   Tensor TempChunk(int i) const override {
     AllocationAttributes empty;
-    auto op_annotation =
-        ScopedMemoryDebugAnnotation("CollectiveAdapterImpl::TempChunk", 0);
+    ScopedMemoryDebugAnnotation op_annotation(
+        "CollectiveAdapterImpl::TempChunk");
     return Tensor(allocator_, dt_, {ChunkElts(i)}, empty);
   }

@@ -29,6 +29,7 @@ limitations under the License.
 #ifdef TENSORFLOW_MEM_DEBUG
 #include "tensorflow/core/platform/stacktrace.h"
 #endif
+#include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/profiler/lib/traceme.h"
 #include "tensorflow/core/protobuf/bfc_memory_map.pb.h"
@@ -427,11 +428,13 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
     // Dump the memory log for analysis.
     MaybeWriteMemoryMap();
     if (dump_log_on_failure) {
-      LOG(WARNING) << "Allocator (" << Name() << ") ran out of memory trying "
-                   << "to allocate " << strings::HumanReadableNumBytes(num_bytes)
-                   << " (rounded to " << rounded_bytes << ")"
-                   << "requested by op " << pending_op_name
-                   << "\nCurrent allocation summary follows.";
+      LOG(WARNING)
+          << "Allocator (" << Name() << ") ran out of memory trying "
+          << "to allocate " << strings::HumanReadableNumBytes(num_bytes)
+          << " (rounded to " << rounded_bytes << ")"
+          << "requested by op "
+          << ScopedMemoryDebugAnnotation::CurrentAnnotation().pending_op_name
+          << "\nCurrent allocation summary follows.";
       DumpMemoryLog(rounded_bytes);
       LOG(WARNING) << RenderOccupancy();
     }
@@ -453,6 +456,11 @@ void BFCAllocator::AddTraceMe(absl::string_view traceme_name,
             memory_limit_ - stats.bytes_reserved - stats.bytes_in_use;
         BFCAllocator::Chunk* chunk =
             ChunkFromHandle(region_manager_.get_handle(chunk_ptr));
+        const auto& annotation =
+            ScopedMemoryDebugAnnotation::CurrentAnnotation();
+        std::string tensor_shape = annotation.pending_shape
+                                       ? annotation.pending_shape->DebugString()
+                                       : "";

         return absl::StrCat(traceme_name, "#allocator_name=", name_,
                             ",bytes_reserved=", stats.bytes_reserved,
@@ -462,8 +470,11 @@ void BFCAllocator::AddTraceMe(absl::string_view traceme_name,
                             ",requested_bytes=", chunk->requested_size,
                             ",allocation_bytes=", chunk->size,
                             ",addr=", reinterpret_cast<uint64>(chunk_ptr),
-                            ",tf_op=", pending_op_name, ",id=", pending_step_id,
-                            "#");
+                            ",tf_op=", annotation.pending_op_name,
+                            ",id=", annotation.pending_step_id,
+                            ",region_type=", annotation.pending_region_type,
+                            ",data_type=", annotation.pending_data_type,
+                            ",shape=", tensor_shape, "#");
       },
       traceme_level);
 }
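For illustration only, the TraceMe metadata assembled above would take roughly this shape after the change (field values here are invented, and the other stats fields are elided):

    <traceme_name>#allocator_name=GPU_0_bfc,...,tf_op=MatMul,id=42,region_type=output,data_type=1,shape=[128,256]#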
@@ -516,17 +527,20 @@ void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,

 #ifdef TENSORFLOW_MEM_DEBUG
       if (ShouldRecordOpName()) {
-        if (pending_op_name != nullptr) {
-          chunk->op_name = pending_op_name;
+        const auto& annotation =
+            ScopedMemoryDebugAnnotation::CurrentAnnotation();
+        if (annotation.pending_op_name != nullptr) {
+          chunk->op_name = annotation.pending_op_name;
         } else {
           LOG(INFO) << "missing pending_op_name for " << Name()
                     << " reading addr "
-                    << static_cast<const void*>(&pending_op_name) << "\n"
+                    << static_cast<const void*>(&annotation.pending_op_name)
+                    << "\n"
                     << CurrentStackTrace();
           chunk->op_name = nullptr;
         }
         chunk->action_count = ++action_counter_;
-        chunk->step_id = pending_step_id;
+        chunk->step_id = annotation.pending_step_id;
         int slot = chunk->action_count % MEM_DEBUG_SIZE_HISTORY_SIZE;
         size_history_[slot] = stats_.bytes_in_use;
       }
@@ -50,8 +50,8 @@ class CopyToDeviceNode : public EagerNode {

   Status Run() override {
     tensorflow::Tensor tensor;
-    auto op_annotation = ScopedMemoryDebugAnnotation(
-        pending_op_name ? pending_op_name : "eager::CopyToDeviceNode");
+    ScopedMemoryDebugAnnotation op_annotation(
+        "eager::CopyToDeviceNode", "dynamic", tensor.dtype(), &tensor.shape());
     TF_RETURN_IF_ERROR(src_->CopyToDevice(ctx_, dstd_, &tensor));
     if (!async_ && mirror_) {
       return dst_->AddLocalMirror(std::move(tensor), dstd_);
@@ -374,7 +374,10 @@ Status MustCompileWithXLA(const EagerOperation* op, const EagerContext& ctx,
 // running without an explicitly requested device.
 Status EagerLocalExecute(EagerOperation* op, TensorHandle** retvals,
                          int* num_retvals) {
-  auto op_annotation = ScopedMemoryDebugAnnotation(op->op_name());
+  ScopedMemoryDebugAnnotation op_annotation(
+      op->op_name(), op->remote_func_params().has_value()
+                         ? op->remote_func_params().value().step_id.value_or(0)
+                         : 0);
   profiler::TraceMe activity(
       [&] { return absl::StrCat("EagerLocalExecute: ", op->Name()); },
       profiler::TraceMeLevel::kInfo);
@@ -340,7 +340,7 @@ Status BaseGPUDevice::InitScratchBuffers() {
   if (!scratch_) {
     DCHECK(stream_);
     size_t scratch_buffer_size = Eigen::kGpuScratchSize + sizeof(unsigned int);
-    auto op_annotation = ScopedMemoryDebugAnnotation("ScratchBuffer");
+    ScopedMemoryDebugAnnotation op_annotation("ScratchBuffer");
     void* scratch_buffer = gpu_allocator_->AllocateRaw(
         Allocator::kAllocatorAlignment, scratch_buffer_size);
     if (scratch_buffer == nullptr) {
@@ -498,8 +498,8 @@ void BaseGPUDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
       }
     }
     ScopedActivateExecutorContext scoped_activation{stream->parent()};
-    auto op_annotation = ScopedMemoryDebugAnnotation(
-        op_kernel->name_view().data(), context->step_id());
+    ScopedMemoryDebugAnnotation op_annotation(op_kernel->name_view().data(),
+                                              context->step_id());
     op_kernel->Compute(context);
     if (context->status().ok()) {
       if (sync_every_op_) {
@@ -612,8 +612,6 @@ Status BaseGPUDevice::MaybeCopyTensorToGPU(
 Status BaseGPUDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
                                           const AllocatorAttributes alloc_attrs,
                                           Tensor* tensor) {
-  auto op_annotation = ScopedMemoryDebugAnnotation(
-      (pending_op_name != nullptr ? pending_op_name : "MakeTensorFromProto"));
   AllocatorAttributes attr;
   attr.set_on_host(true);
   attr.set_gpu_compatible(true);
@@ -624,6 +622,8 @@ Status BaseGPUDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
                                    tensor_proto.DebugString());
   }

+  ScopedMemoryDebugAnnotation op_annotation("MakeTensorFromProto", "dynamic",
+                                            parsed.dtype(), &parsed.shape());
   if (parsed.dtype() == DT_VARIANT) {
     const Variant* from = parsed.flat<Variant>().data();
     int numa_node = attributes().locality().numa_node();
@@ -409,8 +409,9 @@ void HierarchicalTreeBroadcaster::DispatchSend(int subdiv, int dst_rank,
                                                int src_rank,
                                                const Tensor* src_tensor,
                                                const StatusCallback& done) {
-  auto op_annotation = ScopedMemoryDebugAnnotation(
-      col_ctx_->op_ctx->op_kernel().name_view().data());
+  ScopedMemoryDebugAnnotation op_annotation(
+      col_ctx_->op_ctx->op_kernel().name_view().data(), col_ctx_->step_id,
+      "dynamic", src_tensor->dtype(), &src_tensor->shape());
   string send_buf_key =
       BroadcastBufKey(col_ctx_->exec_key, subdiv, src_rank, dst_rank);
   int dst_idx =
@@ -74,7 +74,8 @@ void SameWorkerRecvDone(const DeviceMgr* device_mgr,
     return;
   }

-  auto op_annotation = ScopedMemoryDebugAnnotation("SameWorkerRecvDone");
+  ScopedMemoryDebugAnnotation op_annotation("SameWorkerRecvDone", 0, "dynamic",
+                                            in.dtype(), &in.shape());
   AllocatorAttributes attr = recv_args.alloc_attrs;
   attr.set_gpu_compatible(send_args.alloc_attrs.gpu_compatible() ||
                           recv_args.alloc_attrs.gpu_compatible());
@@ -112,7 +113,7 @@ void IntraProcessRecvAsyncImpl(const DeviceMgr* device_mgr,
                                RendezvousInterface::DoneCallback done) {
   VLOG(1) << "IntraProcessRendezvous Recv " << local << " " << parsed.FullKey();

-  auto op_annotation = ScopedMemoryDebugAnnotation("RecvAsync");
+  ScopedMemoryDebugAnnotation op_annotation("RecvAsync");
   // Recv the tensor from local_.
   local->RecvAsync(
       parsed, recv_args,
@@ -272,9 +272,8 @@ void BaseRemoteRendezvous::SameWorkerRecvDone(
     return;
   }

-  // Note that it would be nice to cache the step_id here, but it's not
-  // available.
-  auto op_annotation = ScopedMemoryDebugAnnotation("SameWorkerRecvDone", 0);
+  ScopedMemoryDebugAnnotation op_annotation("SameWorkerRecvDone", step_id_,
+                                            "dynamic", in.dtype(), &in.shape());
   AllocatorAttributes attr = recv_args.alloc_attrs;
   attr.set_gpu_compatible(send_args.alloc_attrs.gpu_compatible() ||
                           recv_args.alloc_attrs.gpu_compatible());
@@ -323,7 +322,7 @@ void BaseRemoteRendezvous::RecvAsync(const ParsedKey& parsed,
   DCHECK(is_initialized()) << "RecvAsync called when uninitialized (key: "
                            << parsed.FullKey() << ").";

-  auto op_annotation = ScopedMemoryDebugAnnotation("RecvAsync", 0);
+  ScopedMemoryDebugAnnotation op_annotation("RecvAsync", step_id_);
   // Are src and dst in the same worker?
   if (IsSameWorker(parsed.src, parsed.dst)) {
     // Recv the tensor from local_.
@@ -129,9 +129,10 @@ void CollectiveRemoteAccessDistributed::RecvFromPeer(
   }
   AllocatorAttributes cpu_attr;
   cpu_attr.set_gpu_compatible(true);
-  auto op_annotation = ScopedMemoryDebugAnnotation(
-      "CollectiveRemoteAccessDistributed::RecvFromPeer"
-      "::recv_buf_callback");
+  ScopedMemoryDebugAnnotation op_annotation(
+      "CollectiveRemoteAccessDistributed::RecvFromPeer"
+      "::recv_buf_callback",
+      step_id_, "dynamic", to_tensor->dtype(), &to_tensor->shape());
   Tensor* cpu_tensor = new Tensor(cpu_dev->GetAllocator(cpu_attr),
                                   to_tensor->dtype(), to_tensor->shape());
   PopulateTensorFromExtra(extra, cpu_tensor);
@@ -669,8 +669,9 @@ void GrpcWorker::RecvBufAsync(CallOptions* opts, const RecvBufRequest* request,
       AllocatorAttributes cpu_attr;
       cpu_attr.set_gpu_compatible(true);
       cpu_attr.set_nic_compatible(true);
-      auto op_annotation = ScopedMemoryDebugAnnotation(
-          "GrpcWorker::RecvBufAsync::consumer_callback");
+      ScopedMemoryDebugAnnotation op_annotation(
+          "GrpcWorker::RecvBufAsync::consumer_callback", request->step_id(),
+          "dynamic", hook->prod_value->dtype(), &hook->prod_value->shape());
       Tensor* cpu_tensor =
           new Tensor(cpu_dev->GetAllocator(cpu_attr),
                      hook->prod_value->dtype(), hook->prod_value->shape());
@@ -27,8 +27,7 @@ limitations under the License.

 namespace tensorflow {

-thread_local const char* pending_op_name = nullptr;
-thread_local int64 pending_step_id = 0;
+thread_local MemoryDebugAnnotation ScopedMemoryDebugAnnotation::annotation_;

 string AllocatorStats::DebugString() const {
   return strings::Printf(
@@ -32,6 +32,8 @@ limitations under the License.

 namespace tensorflow {

+class TensorShape;
+
 // Attributes for a single allocation call. Different calls to the same
 // allocator could potentially have different allocation attributes.
 struct AllocationAttributes {
@@ -62,31 +64,80 @@ struct AllocationAttributes {
   TF_DISALLOW_COPY_AND_ASSIGN(AllocationAttributes);
 };

-// The runtime will cache Op names in thread-local memory and some allocators
-// will try to tag allocations with the requesting Op.
-extern thread_local const char* pending_op_name;
-extern thread_local int64 pending_step_id;
+// Annotations for memory profiling and debugging purpose. The runtime will
+// cache the annotations in thread-local memory, and some allocators will try to
+// tag allocations with the annotations.
+struct MemoryDebugAnnotation {
+  const char* pending_op_name = nullptr;
+  int64 pending_step_id = 0;
+  const char* pending_region_type = nullptr;
+  int32 pending_data_type = 0;
+  const TensorShape* pending_shape = nullptr;
+};

-// Wrapper class of pending_op_name and pending_step_id for RAII.
+// Wrapper class of MemoryDebugAnnotation for RAII.
 class ScopedMemoryDebugAnnotation {
  public:
+  static const MemoryDebugAnnotation& CurrentAnnotation() {
+    return annotation_;
+  }
+
   explicit ScopedMemoryDebugAnnotation(const char* op_name) {
-    last_op_name_ = pending_op_name;
-    pending_op_name = op_name;
+    last_annotation_ = annotation_;
+    CleanupAnnotation();
+    annotation_.pending_op_name = op_name;
   }

   explicit ScopedMemoryDebugAnnotation(const char* op_name, int64 step_id) {
-    last_op_name_ = pending_op_name;
-    pending_op_name = op_name;
-    pending_step_id = step_id;
+    last_annotation_ = annotation_;
+    CleanupAnnotation();
+    annotation_.pending_op_name = op_name;
+    annotation_.pending_step_id = step_id;
   }

-  ~ScopedMemoryDebugAnnotation() { pending_op_name = last_op_name_; }
+  // This constructor keeps the pending_op_name and pending_step_id from parent
+  // (if any). Otherwise it overwrites with op_name.
+  explicit ScopedMemoryDebugAnnotation(const char* op_name,
+                                       const char* region_type, int32 data_type,
+                                       const TensorShape* shape) {
+    last_annotation_ = annotation_;
+    if (!annotation_.pending_op_name) {
+      annotation_.pending_op_name = op_name;
+    }
+    annotation_.pending_region_type = region_type;
+    annotation_.pending_data_type = data_type;
+    annotation_.pending_shape = shape;
+  }
+
+  explicit ScopedMemoryDebugAnnotation(const char* op_name, int64 step_id,
+                                       const char* region_type, int32 data_type,
+                                       const TensorShape* shape) {
+    last_annotation_ = annotation_;
+    annotation_.pending_op_name = op_name;
+    annotation_.pending_step_id = step_id;
+    annotation_.pending_region_type = region_type;
+    annotation_.pending_data_type = data_type;
+    annotation_.pending_shape = shape;
+  }
+
+  ~ScopedMemoryDebugAnnotation() { annotation_ = last_annotation_; }

  private:
-  // Stores the previous value of pending_op_name in case the annotations are
-  // nested.
-  const char* last_op_name_ = nullptr;
+  void CleanupAnnotation() {
+    annotation_.pending_op_name = nullptr;
+    annotation_.pending_step_id = 0;
+    annotation_.pending_region_type = nullptr;
+    annotation_.pending_data_type = 0;
+    annotation_.pending_shape = nullptr;
+  }
+
+  // Stores the current annotations.
+  static thread_local MemoryDebugAnnotation annotation_;
+
+  // Stores the previous values in case the annotations are nested.
+  MemoryDebugAnnotation last_annotation_;

   TF_DISALLOW_COPY_AND_ASSIGN(ScopedMemoryDebugAnnotation);
 };

 // Runtime statistics collected by an allocator. Exactly the same as
@@ -703,8 +703,6 @@ Status OpKernelContext::allocate_tensor(
     DataType type, const TensorShape& shape, Tensor* out_tensor,
     AllocatorAttributes attr, const AllocationAttributes& allocation_attr) {
   Allocator* a = get_allocator(attr);
-  auto op_annotation =
-      ScopedMemoryDebugAnnotation(op_kernel().name_view().data(), step_id());
   Tensor new_tensor(a, type, shape,
                     AllocationAttributes(allocation_attr.no_retry_on_failure,
                                          /* allocation_will_be_logged= */ true,
|
||||
" more than once. Try turning off the ScopedAllocator optimizer.");
|
||||
}
|
||||
}
|
||||
ScopedMemoryDebugAnnotation op_annotation(op_kernel().name_view().data(),
|
||||
step_id(), "output", type, &shape);
|
||||
auto output_tensor = MakeUnique<Tensor>();
|
||||
Status s = allocate_tensor(type, shape, output_tensor.get(), attr);
|
||||
if (s.ok()) {
|
||||
@ -787,6 +787,8 @@ Status OpKernelContext::allocate_temp(
|
||||
<< ". Switch to allocate_output to avoid performance penalty.";
|
||||
allocator_attr.scope_id = -1;
|
||||
}
|
||||
ScopedMemoryDebugAnnotation op_annotation(op_kernel().name_view().data(),
|
||||
step_id(), "temp", type, &shape);
|
||||
Status s =
|
||||
allocate_tensor(type, shape, out_temp, allocator_attr, allocation_attr);
|
||||
if (track_allocations() && s.ok() && out_temp->TotalBytes() > 0) {
|
||||
@ -815,6 +817,8 @@ Status OpKernelContext::allocate_persistent(DataType type,
|
||||
return errors::Internal(
|
||||
"Unexpected call to allocate_persistent with scope_id ", attr.scope_id);
|
||||
}
|
||||
ScopedMemoryDebugAnnotation op_annotation(op_kernel().name_view().data(),
|
||||
step_id(), "persist", type, &shape);
|
||||
Tensor persistent;
|
||||
Status s = allocate_tensor(type, shape, &persistent, attr);
|
||||
if (s.ok()) {
|
||||
@ -921,6 +925,9 @@ bool OpKernelContext::maybe_set_output_by_allocate_and_copy(
|
||||
<< " params_->forward_from_array[index] "
|
||||
<< params_->forward_from_array[index] << " alloc_attr.scope_id "
|
||||
<< output_alloc_attr(index).scope_id;
|
||||
ScopedMemoryDebugAnnotation op_annotation(op_kernel().name_view().data(),
|
||||
step_id(), "output",
|
||||
tensor.dtype(), &tensor.shape());
|
||||
auto new_tensor = MakeUnique<Tensor>();
|
||||
Status s = allocate_tensor(tensor.dtype(), tensor.shape(), new_tensor.get(),
|
||||
output_alloc_attr(index));
|
||||
|
@@ -73,7 +73,7 @@ ConstantOp::ConstantOp(OpKernelConstruction* ctx)
     : OpKernel(ctx, StripTensorDataFromNodeDef(ctx), false),
       tensor_(ctx->output_type(0)) {
   const TensorProto* proto = nullptr;
-  auto op_annotation = ScopedMemoryDebugAnnotation(name_view().data());
+  ScopedMemoryDebugAnnotation op_annotation(name_view().data());
   OP_REQUIRES_OK(ctx, ctx->GetAttr("value", &proto));
   OP_REQUIRES_OK(ctx, ctx->device()->MakeTensorFromProto(
                           *proto, AllocatorAttributes(), &tensor_));