Always record memory debug info, such as op_name and step_id, in BFCAllocator chunks. Remove the TENSORFLOW_MEM_DEBUG and MEM_DEBUG_SIZE_HISTORY_SIZE macros, since the debug information is now recorded by default.

PiperOrigin-RevId: 355658019
Change-Id: I9df55244bc269e701a26ced5a83ea1783596b252
Authored by Tianrun Li on 2021-02-04 10:29:36 -08:00; committed by TensorFlower Gardener
parent a58d44afc8
commit 7ed3687495
4 changed files with 121 additions and 41 deletions
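
For context on the mechanism: attribution flows through the thread-local ScopedMemoryDebugAnnotation. Whatever op name and step id are pending on the calling thread when AllocateRaw runs get stamped onto the chunk; this is the same pattern the new bfc_allocator_test.cc benchmark below uses. A minimal caller-side sketch (the allocator pointer, op name, alignment, and sizes here are illustrative, not part of this change):

#include "tensorflow/core/framework/allocator.h"

// While `annotation` is alive on this thread, every chunk the BFC
// allocator hands out records "my_op" as its op_name (plus the pending
// step_id), so memory dumps and debug logs can attribute the allocation.
void AllocateWithAttribution(tensorflow::Allocator* allocator) {
  tensorflow::ScopedMemoryDebugAnnotation annotation("my_op");
  void* buf = allocator->AllocateRaw(/*alignment=*/64, /*num_bytes=*/1 << 20);
  // ... use buf ...
  allocator->DeallocateRaw(buf);
}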

tensorflow/core/common_runtime/BUILD

@@ -2718,3 +2718,15 @@ tf_cc_test(
         "//third_party/eigen3",
     ],
 )
+
+tf_cc_test(
+    name = "bfc_allocator_test",
+    srcs = ["bfc_allocator_test.cc"],
+    deps = [
+        ":bfc_allocator",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/framework:allocator",
+        "//tensorflow/core/platform:test_benchmark",
+    ],
+)

tensorflow/core/common_runtime/bfc_allocator.cc

@@ -26,9 +26,7 @@ limitations under the License.
 #include "tensorflow/core/platform/file_system.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
-#ifdef TENSORFLOW_MEM_DEBUG
 #include "tensorflow/core/platform/stacktrace.h"
-#endif
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/profiler/lib/traceme.h"
@@ -566,27 +564,22 @@ void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
       std::max(stats_.peak_bytes_in_use, stats_.bytes_in_use);
   stats_.largest_alloc_size =
       std::max<std::size_t>(stats_.largest_alloc_size, chunk->size);
-#ifdef TENSORFLOW_MEM_DEBUG
-  if (ShouldRecordOpName()) {
-    const auto& annotation =
-        ScopedMemoryDebugAnnotation::CurrentAnnotation();
-    if (annotation.pending_op_name != nullptr) {
-      chunk->op_name = annotation.pending_op_name;
-    } else {
-      LOG(INFO) << "missing pending_op_name for " << Name()
-                << " reading addr "
-                << static_cast<const void*>(&annotation.pending_op_name)
-                << "\n"
-                << CurrentStackTrace();
-      chunk->op_name = nullptr;
-    }
-    chunk->action_count = ++action_counter_;
-    chunk->step_id = annotation.pending_step_id;
-    int slot = chunk->action_count % MEM_DEBUG_SIZE_HISTORY_SIZE;
-    size_history_[slot] = stats_.bytes_in_use;
+  const auto& annotation =
+      ScopedMemoryDebugAnnotation::CurrentAnnotation();
+  chunk->op_name = annotation.pending_op_name;
+  if (!annotation.pending_op_name) {
+    VLOG(2) << "missing pending_op_name for " << Name()
+            << " reading addr "
+            << static_cast<const void*>(&annotation.pending_op_name)
+            << "\n"
+            << CurrentStackTrace();
   }
-#endif
+  chunk->step_id = annotation.pending_step_id;
+  chunk->action_count = ++action_counter_;
+  uint64 slot = chunk->action_count % kMemDebugHistorySize;
+  size_history_[slot] = stats_.bytes_in_use;
 
   VLOG(4) << "Returning: " << chunk->ptr;
   if (VLOG_IS_ON(4)) {
@@ -764,13 +757,11 @@ void BFCAllocator::MarkFree(BFCAllocator::ChunkHandle h) {
   // Updates the stats.
   stats_.bytes_in_use -= c->size;
-#ifdef TENSORFLOW_MEM_DEBUG
-  if (ShouldRecordOpName()) {
-    c->action_count = ++action_counter_;
-    int slot = c->action_count % MEM_DEBUG_SIZE_HISTORY_SIZE;
-    size_history_[slot] = stats_.bytes_in_use;
-  }
-#endif
+  c->action_count = ++action_counter_;
+  uint64 slot = c->action_count % kMemDebugHistorySize;
+  size_history_[slot] = stats_.bytes_in_use;
 }
 
 BFCAllocator::ChunkHandle BFCAllocator::TryToCoalesce(ChunkHandle h,
@@ -1039,12 +1030,11 @@ void BFCAllocator::DumpMemoryLog(size_t num_bytes) {
     string buf = strings::StrCat(
         (c->in_use() ? "InUse" : "Free "), " at ",
         strings::Hex(reinterpret_cast<uint64>(c->ptr)), " of size ", c->size);
-#ifdef TENSORFLOW_MEM_DEBUG
-    if (ShouldRecordOpName()) {
-      strings::StrAppend(&buf, " by op ", c->op_name, " action_count ",
-                         c->action_count, " step ", c->step_id);
-    }
-#endif
+    strings::StrAppend(
+        &buf, " by op ", c->op_name ? string(c->op_name) : "UNKNOWN",
+        " action_count ", c->action_count, " step ", c->step_id);
     strings::StrAppend(&buf, " next ", c->next);
     if (timing_counter_) {
       strings::StrAppend(&buf, " freed_at_count ", c->freed_at_count);
@@ -1134,11 +1124,9 @@ MemoryDump BFCAllocator::RecordMemoryMapInternal() {
     mc->set_size(c->size);
     mc->set_requested_size(c->requested_size);
     mc->set_bin(c->bin_num);
-#ifdef TENSORFLOW_MEM_DEBUG
     mc->set_op_name(c->op_name ? string(c->op_name) : "UNKNOWN");
     mc->set_step_id(c->step_id);
     mc->set_action_count(c->action_count);
-#endif
     if (timing_counter_) {
       mc->set_freed_at_count(c->in_use() ? 0 : c->freed_at_count);
     }
@@ -1148,17 +1136,14 @@ MemoryDump BFCAllocator::RecordMemoryMapInternal() {
   mas->set_fragmentation_metric(GetFragmentation());
-#ifdef TENSORFLOW_MEM_DEBUG
   // Record the recent size history
-  int history_len = static_cast<int>(std::min(
-      action_counter_, static_cast<long long>(MEM_DEBUG_SIZE_HISTORY_SIZE)));
-  for (int i = action_counter_ - history_len; i < action_counter_; ++i) {
+  uint64 history_len = std::min(action_counter_, kMemDebugHistorySize);
+  for (uint64 i = action_counter_ - history_len; i < action_counter_; ++i) {
     SnapShot* ss = md.add_snap_shot();
     ss->set_action_count(i);
-    int slot = i % MEM_DEBUG_SIZE_HISTORY_SIZE;
+    uint64 slot = i % kMemDebugHistorySize;
     ss->set_size(size_history_[slot]);
   }
-#endif
   return md;
 }
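
The size history recorded here is a fixed-size ring buffer: every allocation and free bumps action_counter_ and overwrites the oldest slot, and the dump replays the most recent min(action_counter_, kMemDebugHistorySize) entries oldest-first. A standalone sketch of that indexing scheme (slightly simplified relative to the allocator's exact code: it writes before incrementing, so the replay window covers exactly the recorded slots):

#include <algorithm>
#include <cstdint>
#include <vector>

constexpr uint64_t kHistorySize = 4096;  // mirrors kMemDebugHistorySize

struct SizeHistory {
  uint64_t action_counter = 0;
  int64_t history[kHistorySize] = {};

  // Called on every allocation and free: overwrite the oldest slot.
  void Record(int64_t bytes_in_use) {
    history[action_counter % kHistorySize] = bytes_in_use;
    ++action_counter;
  }

  // Replay the most recent entries, oldest first, like the SnapShot
  // loop in RecordMemoryMapInternal.
  std::vector<int64_t> Recent() const {
    const uint64_t len = std::min(action_counter, kHistorySize);
    std::vector<int64_t> out;
    out.reserve(len);
    for (uint64_t i = action_counter - len; i < action_counter; ++i) {
      out.push_back(history[i % kHistorySize]);
    }
    return out;
  }
};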

tensorflow/core/common_runtime/bfc_allocator.h

@@ -188,12 +188,10 @@ class BFCAllocator : public Allocator {
     bool in_use() const { return allocation_id != -1; }
 
-#ifdef TENSORFLOW_MEM_DEBUG
     // optional debugging info
     const char* op_name = nullptr;
     uint64 step_id = 0;
-    int64 action_count = 0;
-#endif
+    uint64 action_count = 0;
 
     string DebugString(BFCAllocator* a,
                        bool recurse) TF_NO_THREAD_SAFETY_ANALYSIS {
@@ -210,11 +208,9 @@ class BFCAllocator : public Allocator {
         Chunk* n = a->ChunkFromHandle(next);
         strings::StrAppend(&dbg, ", next: ", n->DebugString(a, false));
       }
-#ifdef TENSORFLOW_MEM_DEBUG
       strings::StrAppend(&dbg, ", for: ", op_name ? op_name : "UNKNOWN",
                          ", stepid: ", step_id,
                          ", last_action: ", action_count);
-#endif
       return dbg;
     }
   };
@@ -593,11 +589,11 @@ class BFCAllocator : public Allocator {
   // Stats.
   AllocatorStats stats_ TF_GUARDED_BY(lock_);
-#ifdef TENSORFLOW_MEM_DEBUG
-  int64 action_counter_ TF_GUARDED_BY(lock_);
-#define MEM_DEBUG_SIZE_HISTORY_SIZE 4096
-  int64 size_history_[MEM_DEBUG_SIZE_HISTORY_SIZE];
-#endif
+  uint64 action_counter_ TF_GUARDED_BY(lock_);
+  // The circular buffer used to track memory operation history.
+  static constexpr uint64 kMemDebugHistorySize = 4096;
+  int64 size_history_[kMemDebugHistorySize];
 
   friend class GPUBFCAllocatorPrivateMethodsTest;
   friend class GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific;

tensorflow/core/common_runtime/bfc_allocator_test.cc

@@ -0,0 +1,87 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/common_runtime/bfc_allocator.h"
#include <algorithm>
#include <random>
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/test_benchmark.h"
namespace tensorflow {
// A fake SubAllocator to test the performance of BFCAllocator.
class FakeSubAllocator : public SubAllocator {
public:
FakeSubAllocator() : SubAllocator({}, {}), alloc_counter_(0) {}
~FakeSubAllocator() override {}
// Alloc and Free functions are implemented as very cheap operations, so that
// the benchmark can focus on the performance of BFCAllocator itself.
void* Alloc(size_t alignment, size_t num_bytes,
size_t* bytes_received) override {
*bytes_received = num_bytes;
return reinterpret_cast<void*>(alloc_counter_++);
}
void Free(void* ptr, size_t num_bytes) override {}
bool SupportsCoalescing() const override { return false; }
private:
int64 alloc_counter_;
};
void BM_Allocator(::testing::benchmark::State& state) {
constexpr int kAllocSize = 1 << 14;
const int kLongLivedObjects = state.range(0);
const int kShortLivedObjects = state.range(1);
FakeSubAllocator* sub_allocator = new FakeSubAllocator;
BFCAllocator bfc_allocator(sub_allocator, 1 << 30, false, "GPU_0_bfc");
string test_op_name = "test_op";
ScopedMemoryDebugAnnotation annotation(test_op_name.data());
// Allocate long lived objects.
std::vector<void*> long_lived(kLongLivedObjects);
for (int i = 0; i < kLongLivedObjects; i++) {
long_lived[i] = bfc_allocator.AllocateRaw(1, kAllocSize);
}
std::vector<int> deallocation_order(kShortLivedObjects);
for (int i = 0; i < kShortLivedObjects; i++) {
deallocation_order[i] = i;
}
std::shuffle(deallocation_order.begin(), deallocation_order.end(),
std::default_random_engine(0));
// Allocate and deallocate short lived objects.
std::vector<void*> short_lived(kShortLivedObjects);
for (auto _ : state) {
for (int i = 0; i < kShortLivedObjects; i++) {
short_lived[i] = bfc_allocator.AllocateRaw(1, kAllocSize);
}
for (int i = 0; i < kShortLivedObjects; i++) {
bfc_allocator.DeallocateRaw(short_lived[deallocation_order[i]]);
}
}
}
BENCHMARK(BM_Allocator)
->ArgPair(0, 256)
->ArgPair(1000, 256)
->ArgPair(10000, 256);
} // namespace tensorflow
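
A usage note, hedged since benchmark-flag plumbing varies by checkout: benchmarks built with tf_cc_test and test_benchmark have traditionally been invoked with something like

bazel run -c opt //tensorflow/core/common_runtime:bfc_allocator_test -- --benchmarks=all

The three ArgPair configurations hold the short-lived set fixed at 256 allocations per iteration while growing the long-lived set (0, 1000, 10000 chunks), so the benchmark shows how allocate/free cost scales as more of the region stays occupied by long-lived chunks.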