diff --git a/tensorflow/core/common_runtime/BUILD b/tensorflow/core/common_runtime/BUILD index 89a4f27f484..e73d6852fd1 100644 --- a/tensorflow/core/common_runtime/BUILD +++ b/tensorflow/core/common_runtime/BUILD @@ -2718,3 +2718,15 @@ tf_cc_test( "//third_party/eigen3", ], ) + +tf_cc_test( + name = "bfc_allocator_test", + srcs = ["bfc_allocator_test.cc"], + deps = [ + ":bfc_allocator", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/framework:allocator", + "//tensorflow/core/platform:test_benchmark", + ], +) diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc index 1281a54ea0f..3551472e9c6 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.cc +++ b/tensorflow/core/common_runtime/bfc_allocator.cc @@ -26,9 +26,7 @@ limitations under the License. #include "tensorflow/core/platform/file_system.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/mutex.h" -#ifdef TENSORFLOW_MEM_DEBUG #include "tensorflow/core/platform/stacktrace.h" -#endif #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/lib/traceme.h" @@ -566,27 +564,22 @@ void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes, std::max(stats_.peak_bytes_in_use, stats_.bytes_in_use); stats_.largest_alloc_size = std::max(stats_.largest_alloc_size, chunk->size); - -#ifdef TENSORFLOW_MEM_DEBUG if (ShouldRecordOpName()) { const auto& annotation = ScopedMemoryDebugAnnotation::CurrentAnnotation(); - if (annotation.pending_op_name != nullptr) { - chunk->op_name = annotation.pending_op_name; - } else { - LOG(INFO) << "missing pending_op_name for " << Name() - << " reading addr " - << static_cast<const void*>(&annotation.pending_op_name) - << "\n" - << CurrentStackTrace(); - chunk->op_name = nullptr; + chunk->op_name = annotation.pending_op_name; + if (!annotation.pending_op_name) { + VLOG(2) << "missing 
pending_op_name for " << Name() + << " reading addr " + << static_cast<const void*>(&annotation.pending_op_name) + << "\n" + << CurrentStackTrace(); } - chunk->action_count = ++action_counter_; chunk->step_id = annotation.pending_step_id; - int slot = chunk->action_count % MEM_DEBUG_SIZE_HISTORY_SIZE; + chunk->action_count = ++action_counter_; + uint64 slot = chunk->action_count % kMemDebugHistorySize; size_history_[slot] = stats_.bytes_in_use; } -#endif VLOG(4) << "Returning: " << chunk->ptr; if (VLOG_IS_ON(4)) { @@ -764,13 +757,11 @@ void BFCAllocator::MarkFree(BFCAllocator::ChunkHandle h) { // Updates the stats. stats_.bytes_in_use -= c->size; -#ifdef TENSORFLOW_MEM_DEBUG if (ShouldRecordOpName()) { c->action_count = ++action_counter_; - int slot = c->action_count % MEM_DEBUG_SIZE_HISTORY_SIZE; + uint64 slot = c->action_count % kMemDebugHistorySize; size_history_[slot] = stats_.bytes_in_use; } -#endif } BFCAllocator::ChunkHandle BFCAllocator::TryToCoalesce(ChunkHandle h, @@ -1039,12 +1030,11 @@ void BFCAllocator::DumpMemoryLog(size_t num_bytes) { string buf = strings::StrCat( (c->in_use() ? "InUse" : "Free "), " at ", strings::Hex(reinterpret_cast<uint64>(c->ptr)), " of size ", c->size); -#ifdef TENSORFLOW_MEM_DEBUG if (ShouldRecordOpName()) { - strings::StrAppend(&buf, " by op ", c->op_name, " action_count ", - c->action_count, " step ", c->step_id); + strings::StrAppend( + &buf, " by op ", c->op_name ? string(c->op_name) : "UNKNOWN", + " action_count ", c->action_count, " step ", c->step_id); } -#endif strings::StrAppend(&buf, " next ", c->next); if (timing_counter_) { strings::StrAppend(&buf, " freed_at_count ", c->freed_at_count); } @@ -1134,11 +1124,9 @@ MemoryDump BFCAllocator::RecordMemoryMapInternal() { mc->set_size(c->size); mc->set_requested_size(c->requested_size); mc->set_bin(c->bin_num); -#ifdef TENSORFLOW_MEM_DEBUG mc->set_op_name(c->op_name ? 
string(c->op_name) : "UNKNOWN"); mc->set_step_id(c->step_id); mc->set_action_count(c->action_count); -#endif if (timing_counter_) { mc->set_freed_at_count(c->in_use() ? 0 : c->freed_at_count); } @@ -1148,17 +1136,14 @@ MemoryDump BFCAllocator::RecordMemoryMapInternal() { mas->set_fragmentation_metric(GetFragmentation()); -#ifdef TENSORFLOW_MEM_DEBUG // Record the recent size history - int history_len = static_cast<int>(std::min( - action_counter_, static_cast<int64>(MEM_DEBUG_SIZE_HISTORY_SIZE))); - for (int i = action_counter_ - history_len; i < action_counter_; ++i) { + uint64 history_len = std::min(action_counter_, kMemDebugHistorySize); + for (uint64 i = action_counter_ - history_len; i < action_counter_; ++i) { SnapShot* ss = md.add_snap_shot(); ss->set_action_count(i); - int slot = i % MEM_DEBUG_SIZE_HISTORY_SIZE; + uint64 slot = i % kMemDebugHistorySize; ss->set_size(size_history_[slot]); } -#endif return md; } diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h index 25e97311a0f..d4ebd6ffaa6 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.h +++ b/tensorflow/core/common_runtime/bfc_allocator.h @@ -188,12 +188,10 @@ class BFCAllocator : public Allocator { bool in_use() const { return allocation_id != -1; } -#ifdef TENSORFLOW_MEM_DEBUG // optional debugging info const char* op_name = nullptr; uint64 step_id = 0; - int64 action_count = 0; -#endif + uint64 action_count = 0; string DebugString(BFCAllocator* a, bool recurse) TF_NO_THREAD_SAFETY_ANALYSIS { @@ -210,11 +208,9 @@ class BFCAllocator : public Allocator { Chunk* n = a->ChunkFromHandle(next); strings::StrAppend(&dbg, ", next: ", n->DebugString(a, false)); } -#ifdef TENSORFLOW_MEM_DEBUG strings::StrAppend(&dbg, ", for: ", op_name ? op_name : "UNKNOWN", ", stepid: ", step_id, ", last_action: ", action_count); -#endif return dbg; } }; @@ -593,11 +589,11 @@ class BFCAllocator : public Allocator { // Stats. 
AllocatorStats stats_ TF_GUARDED_BY(lock_); -#ifdef TENSORFLOW_MEM_DEBUG - int64 action_counter_ TF_GUARDED_BY(lock_); -#define MEM_DEBUG_SIZE_HISTORY_SIZE 4096 - int64 size_history_[MEM_DEBUG_SIZE_HISTORY_SIZE]; -#endif + uint64 action_counter_ TF_GUARDED_BY(lock_); + + // The circular buffer used to track memory operation history. + static constexpr uint64 kMemDebugHistorySize = 4096; + int64 size_history_[kMemDebugHistorySize]; friend class GPUBFCAllocatorPrivateMethodsTest; friend class GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific; diff --git a/tensorflow/core/common_runtime/bfc_allocator_test.cc b/tensorflow/core/common_runtime/bfc_allocator_test.cc new file mode 100644 index 00000000000..72bdf4cea8b --- /dev/null +++ b/tensorflow/core/common_runtime/bfc_allocator_test.cc @@ -0,0 +1,87 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/core/common_runtime/bfc_allocator.h" + +#include <algorithm> +#include <random> + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" + +namespace tensorflow { + +// A fake SubAllocator to test the performance of BFCAllocator. 
+class FakeSubAllocator : public SubAllocator { + public: + FakeSubAllocator() : SubAllocator({}, {}), alloc_counter_(0) {} + ~FakeSubAllocator() override {} + + // Alloc and Free functions are implemented as very cheap operations, so that + // the benchmark can focus on the performance of BFCAllocator itself. + void* Alloc(size_t alignment, size_t num_bytes, + size_t* bytes_received) override { + *bytes_received = num_bytes; + return reinterpret_cast<void*>(alloc_counter_++); + } + + void Free(void* ptr, size_t num_bytes) override {} + + bool SupportsCoalescing() const override { return false; } + + private: + int64 alloc_counter_; +}; + +void BM_Allocator(::testing::benchmark::State& state) { + constexpr int kAllocSize = 1 << 14; + const int kLongLivedObjects = state.range(0); + const int kShortLivedObjects = state.range(1); + + FakeSubAllocator* sub_allocator = new FakeSubAllocator; + BFCAllocator bfc_allocator(sub_allocator, 1 << 30, false, "GPU_0_bfc"); + + string test_op_name = "test_op"; + ScopedMemoryDebugAnnotation annotation(test_op_name.data()); + + // Allocate long lived objects. + std::vector<void*> long_lived(kLongLivedObjects); + for (int i = 0; i < kLongLivedObjects; i++) { + long_lived[i] = bfc_allocator.AllocateRaw(1, kAllocSize); + } + std::vector<int> deallocation_order(kShortLivedObjects); + for (int i = 0; i < kShortLivedObjects; i++) { + deallocation_order[i] = i; + } + std::shuffle(deallocation_order.begin(), deallocation_order.end(), + std::default_random_engine(0)); + + // Allocate and deallocate short lived objects. + std::vector<void*> short_lived(kShortLivedObjects); + for (auto _ : state) { + for (int i = 0; i < kShortLivedObjects; i++) { + short_lived[i] = bfc_allocator.AllocateRaw(1, kAllocSize); + } + for (int i = 0; i < kShortLivedObjects; i++) { + bfc_allocator.DeallocateRaw(short_lived[deallocation_order[i]]); + } + } +} +BENCHMARK(BM_Allocator) + ->ArgPair(0, 256) + ->ArgPair(1000, 256) + ->ArgPair(10000, 256); + +} // namespace tensorflow