Always record memory debug info, such as op_name and step_id, in BFCAllocator chunks. Remove the TENSORFLOW_MEM_DEBUG and MEM_DEBUG_SIZE_HISTORY_SIZE macros, since the debug information is now recorded by default.

PiperOrigin-RevId: 355658019
Change-Id: I9df55244bc269e701a26ced5a83ea1783596b252
Authored by Tianrun Li on 2021-02-04 10:29:36 -08:00; committed by TensorFlower Gardener
parent a58d44afc8
commit 7ed3687495
4 changed files with 121 additions and 41 deletions
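
For context on the mechanism: attribution flows through the thread-local ScopedMemoryDebugAnnotation. Whatever op name and step id are pending on the calling thread when AllocateRaw runs get stamped onto the chunk; this is the same pattern the new bfc_allocator_test.cc benchmark below uses. A minimal caller-side sketch (the allocator pointer, op name, alignment, and sizes here are illustrative, not part of this change):

#include "tensorflow/core/framework/allocator.h"

// While `annotation` is alive on this thread, every chunk the BFC
// allocator hands out records "my_op" as its op_name (plus the pending
// step_id), so memory dumps and debug logs can attribute the allocation.
void AllocateWithAttribution(tensorflow::Allocator* allocator) {
  tensorflow::ScopedMemoryDebugAnnotation annotation("my_op");
  void* buf = allocator->AllocateRaw(/*alignment=*/64, /*num_bytes=*/1 << 20);
  // ... use buf ...
  allocator->DeallocateRaw(buf);
}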

tensorflow/core/common_runtime/BUILD

@@ -2718,3 +2718,15 @@ tf_cc_test(
         "//third_party/eigen3",
     ],
 )
+
+tf_cc_test(
+    name = "bfc_allocator_test",
+    srcs = ["bfc_allocator_test.cc"],
+    deps = [
+        ":bfc_allocator",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/framework:allocator",
+        "//tensorflow/core/platform:test_benchmark",
+    ],
+)

tensorflow/core/common_runtime/bfc_allocator.cc

@@ -26,9 +26,7 @@ limitations under the License.
 #include "tensorflow/core/platform/file_system.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
-#ifdef TENSORFLOW_MEM_DEBUG
 #include "tensorflow/core/platform/stacktrace.h"
-#endif
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/profiler/lib/traceme.h"
@@ -566,27 +564,22 @@ void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
       std::max(stats_.peak_bytes_in_use, stats_.bytes_in_use);
   stats_.largest_alloc_size =
       std::max<std::size_t>(stats_.largest_alloc_size, chunk->size);
-#ifdef TENSORFLOW_MEM_DEBUG
-  if (ShouldRecordOpName()) {
-    const auto& annotation =
-        ScopedMemoryDebugAnnotation::CurrentAnnotation();
-    if (annotation.pending_op_name != nullptr) {
-      chunk->op_name = annotation.pending_op_name;
-    } else {
-      LOG(INFO) << "missing pending_op_name for " << Name()
-                << " reading addr "
-                << static_cast<const void*>(&annotation.pending_op_name)
-                << "\n"
-                << CurrentStackTrace();
-      chunk->op_name = nullptr;
-    }
-    chunk->action_count = ++action_counter_;
-    chunk->step_id = annotation.pending_step_id;
-    int slot = chunk->action_count % MEM_DEBUG_SIZE_HISTORY_SIZE;
-    size_history_[slot] = stats_.bytes_in_use;
+  const auto& annotation =
+      ScopedMemoryDebugAnnotation::CurrentAnnotation();
+  chunk->op_name = annotation.pending_op_name;
+  if (!annotation.pending_op_name) {
+    VLOG(2) << "missing pending_op_name for " << Name()
+            << " reading addr "
+            << static_cast<const void*>(&annotation.pending_op_name)
+            << "\n"
+            << CurrentStackTrace();
   }
-#endif
+  chunk->step_id = annotation.pending_step_id;
+  chunk->action_count = ++action_counter_;
+  uint64 slot = chunk->action_count % kMemDebugHistorySize;
+  size_history_[slot] = stats_.bytes_in_use;
 
   VLOG(4) << "Returning: " << chunk->ptr;
   if (VLOG_IS_ON(4)) {
@@ -764,13 +757,11 @@ void BFCAllocator::MarkFree(BFCAllocator::ChunkHandle h) {
   // Updates the stats.
   stats_.bytes_in_use -= c->size;
-#ifdef TENSORFLOW_MEM_DEBUG
-  if (ShouldRecordOpName()) {
-    c->action_count = ++action_counter_;
-    int slot = c->action_count % MEM_DEBUG_SIZE_HISTORY_SIZE;
-    size_history_[slot] = stats_.bytes_in_use;
-  }
-#endif
+  c->action_count = ++action_counter_;
+  uint64 slot = c->action_count % kMemDebugHistorySize;
+  size_history_[slot] = stats_.bytes_in_use;
 }
 
 BFCAllocator::ChunkHandle BFCAllocator::TryToCoalesce(ChunkHandle h,
@@ -1039,12 +1030,11 @@ void BFCAllocator::DumpMemoryLog(size_t num_bytes) {
     string buf = strings::StrCat(
         (c->in_use() ? "InUse" : "Free "), " at ",
         strings::Hex(reinterpret_cast<uint64>(c->ptr)), " of size ", c->size);
-#ifdef TENSORFLOW_MEM_DEBUG
-    if (ShouldRecordOpName()) {
-      strings::StrAppend(&buf, " by op ", c->op_name, " action_count ",
-                         c->action_count, " step ", c->step_id);
-    }
-#endif
+    strings::StrAppend(
+        &buf, " by op ", c->op_name ? string(c->op_name) : "UNKNOWN",
+        " action_count ", c->action_count, " step ", c->step_id);
     strings::StrAppend(&buf, " next ", c->next);
     if (timing_counter_) {
       strings::StrAppend(&buf, " freed_at_count ", c->freed_at_count);
@@ -1134,11 +1124,9 @@ MemoryDump BFCAllocator::RecordMemoryMapInternal() {
     mc->set_size(c->size);
     mc->set_requested_size(c->requested_size);
     mc->set_bin(c->bin_num);
-#ifdef TENSORFLOW_MEM_DEBUG
     mc->set_op_name(c->op_name ? string(c->op_name) : "UNKNOWN");
     mc->set_step_id(c->step_id);
     mc->set_action_count(c->action_count);
-#endif
     if (timing_counter_) {
       mc->set_freed_at_count(c->in_use() ? 0 : c->freed_at_count);
     }
@@ -1148,17 +1136,14 @@ MemoryDump BFCAllocator::RecordMemoryMapInternal() {
   mas->set_fragmentation_metric(GetFragmentation());
-#ifdef TENSORFLOW_MEM_DEBUG
   // Record the recent size history
-  int history_len = static_cast<int>(std::min(
-      action_counter_, static_cast<long long>(MEM_DEBUG_SIZE_HISTORY_SIZE)));
-  for (int i = action_counter_ - history_len; i < action_counter_; ++i) {
+  uint64 history_len = std::min(action_counter_, kMemDebugHistorySize);
+  for (uint64 i = action_counter_ - history_len; i < action_counter_; ++i) {
     SnapShot* ss = md.add_snap_shot();
     ss->set_action_count(i);
-    int slot = i % MEM_DEBUG_SIZE_HISTORY_SIZE;
+    uint64 slot = i % kMemDebugHistorySize;
     ss->set_size(size_history_[slot]);
   }
-#endif
   return md;
 }
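
The size history recorded here is a fixed-size ring buffer: every allocation and free bumps action_counter_ and overwrites the oldest slot, and the dump replays the most recent min(action_counter_, kMemDebugHistorySize) entries oldest-first. A standalone sketch of that indexing scheme (slightly simplified relative to the allocator's exact code: it writes before incrementing, so the replay window covers exactly the recorded slots):

#include <algorithm>
#include <cstdint>
#include <vector>

constexpr uint64_t kHistorySize = 4096;  // mirrors kMemDebugHistorySize

struct SizeHistory {
  uint64_t action_counter = 0;
  int64_t history[kHistorySize] = {};

  // Called on every allocation and free: overwrite the oldest slot.
  void Record(int64_t bytes_in_use) {
    history[action_counter % kHistorySize] = bytes_in_use;
    ++action_counter;
  }

  // Replay the most recent entries, oldest first, like the SnapShot
  // loop in RecordMemoryMapInternal.
  std::vector<int64_t> Recent() const {
    const uint64_t len = std::min(action_counter, kHistorySize);
    std::vector<int64_t> out;
    out.reserve(len);
    for (uint64_t i = action_counter - len; i < action_counter; ++i) {
      out.push_back(history[i % kHistorySize]);
    }
    return out;
  }
};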

tensorflow/core/common_runtime/bfc_allocator.h

@@ -188,12 +188,10 @@ class BFCAllocator : public Allocator {
     bool in_use() const { return allocation_id != -1; }
 
-#ifdef TENSORFLOW_MEM_DEBUG
     // optional debugging info
     const char* op_name = nullptr;
     uint64 step_id = 0;
-    int64 action_count = 0;
-#endif
+    uint64 action_count = 0;
 
     string DebugString(BFCAllocator* a,
                        bool recurse) TF_NO_THREAD_SAFETY_ANALYSIS {
@@ -210,11 +208,9 @@ class BFCAllocator : public Allocator {
         Chunk* n = a->ChunkFromHandle(next);
         strings::StrAppend(&dbg, ", next: ", n->DebugString(a, false));
       }
-#ifdef TENSORFLOW_MEM_DEBUG
       strings::StrAppend(&dbg, ", for: ", op_name ? op_name : "UNKNOWN",
                          ", stepid: ", step_id,
                          ", last_action: ", action_count);
-#endif
       return dbg;
     }
   };
@@ -593,11 +589,11 @@ class BFCAllocator : public Allocator {
   // Stats.
   AllocatorStats stats_ TF_GUARDED_BY(lock_);
-#ifdef TENSORFLOW_MEM_DEBUG
-  int64 action_counter_ TF_GUARDED_BY(lock_);
-#define MEM_DEBUG_SIZE_HISTORY_SIZE 4096
-  int64 size_history_[MEM_DEBUG_SIZE_HISTORY_SIZE];
-#endif
+  uint64 action_counter_ TF_GUARDED_BY(lock_);
+  // The circular buffer used to track memory operation history.
+  static constexpr uint64 kMemDebugHistorySize = 4096;
+  int64 size_history_[kMemDebugHistorySize];
 
   friend class GPUBFCAllocatorPrivateMethodsTest;
   friend class GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific;

tensorflow/core/common_runtime/bfc_allocator_test.cc

@@ -0,0 +1,87 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/common_runtime/bfc_allocator.h"
#include <algorithm>
#include <random>
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/test_benchmark.h"
namespace tensorflow {
// A fake SubAllocator to test the performance of BFCAllocator.
class FakeSubAllocator : public SubAllocator {
public:
FakeSubAllocator() : SubAllocator({}, {}), alloc_counter_(0) {}
~FakeSubAllocator() override {}
// Alloc and Free functions are implemented as very cheap operations, so that
// the benchmark can focus on the performance of BFCAllocator itself.
void* Alloc(size_t alignment, size_t num_bytes,
size_t* bytes_received) override {
*bytes_received = num_bytes;
return reinterpret_cast<void*>(alloc_counter_++);
}
void Free(void* ptr, size_t num_bytes) override {}
bool SupportsCoalescing() const override { return false; }
private:
int64 alloc_counter_;
};
void BM_Allocator(::testing::benchmark::State& state) {
constexpr int kAllocSize = 1 << 14;
const int kLongLivedObjects = state.range(0);
const int kShortLivedObjects = state.range(1);
FakeSubAllocator* sub_allocator = new FakeSubAllocator;
BFCAllocator bfc_allocator(sub_allocator, 1 << 30, false, "GPU_0_bfc");
string test_op_name = "test_op";
ScopedMemoryDebugAnnotation annotation(test_op_name.data());
// Allocate long lived objects.
std::vector<void*> long_lived(kLongLivedObjects);
for (int i = 0; i < kLongLivedObjects; i++) {
long_lived[i] = bfc_allocator.AllocateRaw(1, kAllocSize);
}
std::vector<int> deallocation_order(kShortLivedObjects);
for (int i = 0; i < kShortLivedObjects; i++) {
deallocation_order[i] = i;
}
std::shuffle(deallocation_order.begin(), deallocation_order.end(),
std::default_random_engine(0));
// Allocate and deallocate short lived objects.
std::vector<void*> short_lived(kShortLivedObjects);
for (auto _ : state) {
for (int i = 0; i < kShortLivedObjects; i++) {
short_lived[i] = bfc_allocator.AllocateRaw(1, kAllocSize);
}
for (int i = 0; i < kShortLivedObjects; i++) {
bfc_allocator.DeallocateRaw(short_lived[deallocation_order[i]]);
}
}
}
BENCHMARK(BM_Allocator)
->ArgPair(0, 256)
->ArgPair(1000, 256)
->ArgPair(10000, 256);
} // namespace tensorflow
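
A usage note, hedged since benchmark-flag plumbing varies by checkout: benchmarks built with tf_cc_test and test_benchmark have traditionally been invoked with something like

bazel run -c opt //tensorflow/core/common_runtime:bfc_allocator_test -- --benchmarks=all

The three ArgPair configurations hold the short-lived set fixed at 256 allocations per iteration while growing the long-lived set (0, 1000, 10000 chunks), so the benchmark shows how allocate/free cost scales as more of the region stays occupied by long-lived chunks.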