diff --git a/tensorflow/core/common_runtime/BUILD b/tensorflow/core/common_runtime/BUILD index 89a4f27f484..e73d6852fd1 100644 --- a/tensorflow/core/common_runtime/BUILD +++ b/tensorflow/core/common_runtime/BUILD @@ -2718,3 +2718,15 @@ tf_cc_test( "//third_party/eigen3", ], ) + +tf_cc_test( + name = "bfc_allocator_test", + srcs = ["bfc_allocator_test.cc"], + deps = [ + ":bfc_allocator", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/framework:allocator", + "//tensorflow/core/platform:test_benchmark", + ], +) diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc index 1281a54ea0f..3551472e9c6 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.cc +++ b/tensorflow/core/common_runtime/bfc_allocator.cc @@ -26,9 +26,7 @@ limitations under the License. #include "tensorflow/core/platform/file_system.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/mutex.h" -#ifdef TENSORFLOW_MEM_DEBUG #include "tensorflow/core/platform/stacktrace.h" -#endif #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/lib/traceme.h" @@ -566,27 +564,22 @@ void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes, std::max(stats_.peak_bytes_in_use, stats_.bytes_in_use); stats_.largest_alloc_size = std::max(stats_.largest_alloc_size, chunk->size); - -#ifdef TENSORFLOW_MEM_DEBUG if (ShouldRecordOpName()) { const auto& annotation = ScopedMemoryDebugAnnotation::CurrentAnnotation(); - if (annotation.pending_op_name != nullptr) { - chunk->op_name = annotation.pending_op_name; - } else { - LOG(INFO) << "missing pending_op_name for " << Name() - << " reading addr " - << static_cast<const void*>(&annotation.pending_op_name) - << "\n" - << CurrentStackTrace(); - chunk->op_name = nullptr; + chunk->op_name = annotation.pending_op_name; + if (!annotation.pending_op_name) { + VLOG(2) << "missing 
pending_op_name for " << Name() + << " reading addr " + << static_cast<const void*>(&annotation.pending_op_name) + << "\n" + << CurrentStackTrace(); } - chunk->action_count = ++action_counter_; chunk->step_id = annotation.pending_step_id; - int slot = chunk->action_count % MEM_DEBUG_SIZE_HISTORY_SIZE; + chunk->action_count = ++action_counter_; + uint64 slot = chunk->action_count % kMemDebugHistorySize; size_history_[slot] = stats_.bytes_in_use; } -#endif VLOG(4) << "Returning: " << chunk->ptr; if (VLOG_IS_ON(4)) { @@ -764,13 +757,11 @@ void BFCAllocator::MarkFree(BFCAllocator::ChunkHandle h) { // Updates the stats. stats_.bytes_in_use -= c->size; -#ifdef TENSORFLOW_MEM_DEBUG if (ShouldRecordOpName()) { c->action_count = ++action_counter_; - int slot = c->action_count % MEM_DEBUG_SIZE_HISTORY_SIZE; + uint64 slot = c->action_count % kMemDebugHistorySize; size_history_[slot] = stats_.bytes_in_use; } -#endif } BFCAllocator::ChunkHandle BFCAllocator::TryToCoalesce(ChunkHandle h, @@ -1039,12 +1030,11 @@ void BFCAllocator::DumpMemoryLog(size_t num_bytes) { string buf = strings::StrCat( (c->in_use() ? "InUse" : "Free "), " at ", strings::Hex(reinterpret_cast<uint64>(c->ptr)), " of size ", c->size); -#ifdef TENSORFLOW_MEM_DEBUG if (ShouldRecordOpName()) { - strings::StrAppend(&buf, " by op ", c->op_name, " action_count ", - c->action_count, " step ", c->step_id); + strings::StrAppend( + &buf, " by op ", c->op_name ? string(c->op_name) : "UNKNOWN", + " action_count ", c->action_count, " step ", c->step_id); } -#endif strings::StrAppend(&buf, " next ", c->next); if (timing_counter_) { strings::StrAppend(&buf, " freed_at_count ", c->freed_at_count); } @@ -1134,11 +1124,9 @@ MemoryDump BFCAllocator::RecordMemoryMapInternal() { mc->set_size(c->size); mc->set_requested_size(c->requested_size); mc->set_bin(c->bin_num); -#ifdef TENSORFLOW_MEM_DEBUG mc->set_op_name(c->op_name ? 
string(c->op_name) : "UNKNOWN"); mc->set_step_id(c->step_id); mc->set_action_count(c->action_count); -#endif if (timing_counter_) { mc->set_freed_at_count(c->in_use() ? 0 : c->freed_at_count); } @@ -1148,17 +1136,14 @@ MemoryDump BFCAllocator::RecordMemoryMapInternal() { mas->set_fragmentation_metric(GetFragmentation()); -#ifdef TENSORFLOW_MEM_DEBUG // Record the recent size history - int history_len = static_cast<int>(std::min( - action_counter_, static_cast<int64>(MEM_DEBUG_SIZE_HISTORY_SIZE))); - for (int i = action_counter_ - history_len; i < action_counter_; ++i) { + uint64 history_len = std::min(action_counter_, kMemDebugHistorySize); + for (uint64 i = action_counter_ - history_len; i < action_counter_; ++i) { SnapShot* ss = md.add_snap_shot(); ss->set_action_count(i); - int slot = i % MEM_DEBUG_SIZE_HISTORY_SIZE; + uint64 slot = i % kMemDebugHistorySize; ss->set_size(size_history_[slot]); } -#endif return md; } diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h index 25e97311a0f..d4ebd6ffaa6 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.h +++ b/tensorflow/core/common_runtime/bfc_allocator.h @@ -188,12 +188,10 @@ class BFCAllocator : public Allocator { bool in_use() const { return allocation_id != -1; } -#ifdef TENSORFLOW_MEM_DEBUG // optional debugging info const char* op_name = nullptr; uint64 step_id = 0; - int64 action_count = 0; -#endif + uint64 action_count = 0; string DebugString(BFCAllocator* a, bool recurse) TF_NO_THREAD_SAFETY_ANALYSIS { @@ -210,11 +208,9 @@ class BFCAllocator : public Allocator { Chunk* n = a->ChunkFromHandle(next); strings::StrAppend(&dbg, ", next: ", n->DebugString(a, false)); } -#ifdef TENSORFLOW_MEM_DEBUG strings::StrAppend(&dbg, ", for: ", op_name ? op_name : "UNKNOWN", ", stepid: ", step_id, ", last_action: ", action_count); -#endif return dbg; } }; @@ -593,11 +589,11 @@ class BFCAllocator : public Allocator { // Stats. 
AllocatorStats stats_ TF_GUARDED_BY(lock_); -#ifdef TENSORFLOW_MEM_DEBUG - int64 action_counter_ TF_GUARDED_BY(lock_); -#define MEM_DEBUG_SIZE_HISTORY_SIZE 4096 - int64 size_history_[MEM_DEBUG_SIZE_HISTORY_SIZE]; -#endif + uint64 action_counter_ TF_GUARDED_BY(lock_); + + // The circular buffer used to track memory operation history. + static constexpr uint64 kMemDebugHistorySize = 4096; + int64 size_history_[kMemDebugHistorySize]; friend class GPUBFCAllocatorPrivateMethodsTest; friend class GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific; diff --git a/tensorflow/core/common_runtime/bfc_allocator_test.cc b/tensorflow/core/common_runtime/bfc_allocator_test.cc new file mode 100644 index 00000000000..72bdf4cea8b --- /dev/null +++ b/tensorflow/core/common_runtime/bfc_allocator_test.cc @@ -0,0 +1,87 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/core/common_runtime/bfc_allocator.h" + +#include <algorithm> +#include <random> + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" + +namespace tensorflow { + +// A fake SubAllocator to test the performance of BFCAllocator. 
+class FakeSubAllocator : public SubAllocator { + public: + FakeSubAllocator() : SubAllocator({}, {}), alloc_counter_(0) {} + ~FakeSubAllocator() override {} + + // Alloc and Free functions are implemented as very cheap operations, so that + // the benchmark can focus on the performance of BFCAllocator itself. + void* Alloc(size_t alignment, size_t num_bytes, + size_t* bytes_received) override { + *bytes_received = num_bytes; + return reinterpret_cast<void*>(alloc_counter_++); + } + + void Free(void* ptr, size_t num_bytes) override {} + + bool SupportsCoalescing() const override { return false; } + + private: + int64 alloc_counter_; +}; + +void BM_Allocator(::testing::benchmark::State& state) { + constexpr int kAllocSize = 1 << 14; + const int kLongLivedObjects = state.range(0); + const int kShortLivedObjects = state.range(1); + + FakeSubAllocator* sub_allocator = new FakeSubAllocator; + BFCAllocator bfc_allocator(sub_allocator, 1 << 30, false, "GPU_0_bfc"); + + string test_op_name = "test_op"; + ScopedMemoryDebugAnnotation annotation(test_op_name.data()); + + // Allocate long lived objects. + std::vector<void*> long_lived(kLongLivedObjects); + for (int i = 0; i < kLongLivedObjects; i++) { + long_lived[i] = bfc_allocator.AllocateRaw(1, kAllocSize); + } + std::vector<int> deallocation_order(kShortLivedObjects); + for (int i = 0; i < kShortLivedObjects; i++) { + deallocation_order[i] = i; + } + std::shuffle(deallocation_order.begin(), deallocation_order.end(), + std::default_random_engine(0)); + + // Allocate and deallocate short lived objects. + std::vector<void*> short_lived(kShortLivedObjects); + for (auto _ : state) { + for (int i = 0; i < kShortLivedObjects; i++) { + short_lived[i] = bfc_allocator.AllocateRaw(1, kAllocSize); + } + for (int i = 0; i < kShortLivedObjects; i++) { + bfc_allocator.DeallocateRaw(short_lived[deallocation_order[i]]); + } + } +} +BENCHMARK(BM_Allocator) + ->ArgPair(0, 256) + ->ArgPair(1000, 256) + ->ArgPair(10000, 256); + +} // namespace tensorflow