Always record memory debug info, such as op_name and step_id, in each BFCAllocator Chunk. Remove the TENSORFLOW_MEM_DEBUG flag and the MEM_DEBUG_SIZE_HISTORY_SIZE macro, because the debug information is now initialized by default and no longer needs to be enabled at build time.
PiperOrigin-RevId: 355658019 Change-Id: I9df55244bc269e701a26ced5a83ea1783596b252
This commit is contained in:
parent
a58d44afc8
commit
7ed3687495
tensorflow/core/common_runtime
@ -2718,3 +2718,15 @@ tf_cc_test(
|
||||
"//third_party/eigen3",
|
||||
],
|
||||
)
|
||||
|
||||
# Test target built from bfc_allocator_test.cc. It depends on the local
# :bfc_allocator library, the framework allocator target, and the core
# test / test_main / test_benchmark support libraries.
tf_cc_test(
|
||||
name = "bfc_allocator_test",
|
||||
srcs = ["bfc_allocator_test.cc"],
|
||||
deps = [
|
||||
":bfc_allocator",
|
||||
"//tensorflow/core:test",
|
||||
"//tensorflow/core:test_main",
|
||||
"//tensorflow/core/framework:allocator",
|
||||
"//tensorflow/core/platform:test_benchmark",
|
||||
],
|
||||
)
|
||||
|
@ -26,9 +26,7 @@ limitations under the License.
|
||||
#include "tensorflow/core/platform/file_system.h"
|
||||
#include "tensorflow/core/platform/logging.h"
|
||||
#include "tensorflow/core/platform/mutex.h"
|
||||
#ifdef TENSORFLOW_MEM_DEBUG
|
||||
#include "tensorflow/core/platform/stacktrace.h"
|
||||
#endif
|
||||
#include "tensorflow/core/framework/tensor_shape.h"
|
||||
#include "tensorflow/core/platform/types.h"
|
||||
#include "tensorflow/core/profiler/lib/traceme.h"
|
||||
@ -566,27 +564,22 @@ void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
|
||||
std::max(stats_.peak_bytes_in_use, stats_.bytes_in_use);
|
||||
stats_.largest_alloc_size =
|
||||
std::max<std::size_t>(stats_.largest_alloc_size, chunk->size);
|
||||
|
||||
#ifdef TENSORFLOW_MEM_DEBUG
|
||||
if (ShouldRecordOpName()) {
|
||||
const auto& annotation =
|
||||
ScopedMemoryDebugAnnotation::CurrentAnnotation();
|
||||
if (annotation.pending_op_name != nullptr) {
|
||||
chunk->op_name = annotation.pending_op_name;
|
||||
} else {
|
||||
LOG(INFO) << "missing pending_op_name for " << Name()
|
||||
<< " reading addr "
|
||||
<< static_cast<const void*>(&annotation.pending_op_name)
|
||||
<< "\n"
|
||||
<< CurrentStackTrace();
|
||||
chunk->op_name = nullptr;
|
||||
chunk->op_name = annotation.pending_op_name;
|
||||
if (!annotation.pending_op_name) {
|
||||
VLOG(2) << "missing pending_op_name for " << Name()
|
||||
<< " reading addr "
|
||||
<< static_cast<const void*>(&annotation.pending_op_name)
|
||||
<< "\n"
|
||||
<< CurrentStackTrace();
|
||||
}
|
||||
chunk->action_count = ++action_counter_;
|
||||
chunk->step_id = annotation.pending_step_id;
|
||||
int slot = chunk->action_count % MEM_DEBUG_SIZE_HISTORY_SIZE;
|
||||
chunk->action_count = ++action_counter_;
|
||||
uint64 slot = chunk->action_count % kMemDebugHistorySize;
|
||||
size_history_[slot] = stats_.bytes_in_use;
|
||||
}
|
||||
#endif
|
||||
|
||||
VLOG(4) << "Returning: " << chunk->ptr;
|
||||
if (VLOG_IS_ON(4)) {
|
||||
@ -764,13 +757,11 @@ void BFCAllocator::MarkFree(BFCAllocator::ChunkHandle h) {
|
||||
// Updates the stats.
|
||||
stats_.bytes_in_use -= c->size;
|
||||
|
||||
#ifdef TENSORFLOW_MEM_DEBUG
|
||||
if (ShouldRecordOpName()) {
|
||||
c->action_count = ++action_counter_;
|
||||
int slot = c->action_count % MEM_DEBUG_SIZE_HISTORY_SIZE;
|
||||
uint64 slot = c->action_count % kMemDebugHistorySize;
|
||||
size_history_[slot] = stats_.bytes_in_use;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
BFCAllocator::ChunkHandle BFCAllocator::TryToCoalesce(ChunkHandle h,
|
||||
@ -1039,12 +1030,11 @@ void BFCAllocator::DumpMemoryLog(size_t num_bytes) {
|
||||
string buf = strings::StrCat(
|
||||
(c->in_use() ? "InUse" : "Free "), " at ",
|
||||
strings::Hex(reinterpret_cast<uint64>(c->ptr)), " of size ", c->size);
|
||||
#ifdef TENSORFLOW_MEM_DEBUG
|
||||
if (ShouldRecordOpName()) {
|
||||
strings::StrAppend(&buf, " by op ", c->op_name, " action_count ",
|
||||
c->action_count, " step ", c->step_id);
|
||||
strings::StrAppend(
|
||||
&buf, " by op ", c->op_name ? string(c->op_name) : "UNKNOWN",
|
||||
" action_count ", c->action_count, " step ", c->step_id);
|
||||
}
|
||||
#endif
|
||||
strings::StrAppend(&buf, " next ", c->next);
|
||||
if (timing_counter_) {
|
||||
strings::StrAppend(&buf, " freed_at_count ", c->freed_at_count);
|
||||
@ -1134,11 +1124,9 @@ MemoryDump BFCAllocator::RecordMemoryMapInternal() {
|
||||
mc->set_size(c->size);
|
||||
mc->set_requested_size(c->requested_size);
|
||||
mc->set_bin(c->bin_num);
|
||||
#ifdef TENSORFLOW_MEM_DEBUG
|
||||
mc->set_op_name(c->op_name ? string(c->op_name) : "UNKNOWN");
|
||||
mc->set_step_id(c->step_id);
|
||||
mc->set_action_count(c->action_count);
|
||||
#endif
|
||||
if (timing_counter_) {
|
||||
mc->set_freed_at_count(c->in_use() ? 0 : c->freed_at_count);
|
||||
}
|
||||
@ -1148,17 +1136,14 @@ MemoryDump BFCAllocator::RecordMemoryMapInternal() {
|
||||
|
||||
mas->set_fragmentation_metric(GetFragmentation());
|
||||
|
||||
#ifdef TENSORFLOW_MEM_DEBUG
|
||||
// Record the recent size history
|
||||
int history_len = static_cast<int>(std::min(
|
||||
action_counter_, static_cast<long long>(MEM_DEBUG_SIZE_HISTORY_SIZE)));
|
||||
for (int i = action_counter_ - history_len; i < action_counter_; ++i) {
|
||||
uint64 history_len = std::min(action_counter_, kMemDebugHistorySize);
|
||||
for (uint64 i = action_counter_ - history_len; i < action_counter_; ++i) {
|
||||
SnapShot* ss = md.add_snap_shot();
|
||||
ss->set_action_count(i);
|
||||
int slot = i % MEM_DEBUG_SIZE_HISTORY_SIZE;
|
||||
uint64 slot = i % kMemDebugHistorySize;
|
||||
ss->set_size(size_history_[slot]);
|
||||
}
|
||||
#endif
|
||||
|
||||
return md;
|
||||
}
|
||||
|
@ -188,12 +188,10 @@ class BFCAllocator : public Allocator {
|
||||
|
||||
bool in_use() const { return allocation_id != -1; }
|
||||
|
||||
#ifdef TENSORFLOW_MEM_DEBUG
|
||||
// optional debugging info
|
||||
const char* op_name = nullptr;
|
||||
uint64 step_id = 0;
|
||||
int64 action_count = 0;
|
||||
#endif
|
||||
uint64 action_count = 0;
|
||||
|
||||
string DebugString(BFCAllocator* a,
|
||||
bool recurse) TF_NO_THREAD_SAFETY_ANALYSIS {
|
||||
@ -210,11 +208,9 @@ class BFCAllocator : public Allocator {
|
||||
Chunk* n = a->ChunkFromHandle(next);
|
||||
strings::StrAppend(&dbg, ", next: ", n->DebugString(a, false));
|
||||
}
|
||||
#ifdef TENSORFLOW_MEM_DEBUG
|
||||
strings::StrAppend(&dbg, ", for: ", op_name ? op_name : "UNKNOWN",
|
||||
", stepid: ", step_id,
|
||||
", last_action: ", action_count);
|
||||
#endif
|
||||
return dbg;
|
||||
}
|
||||
};
|
||||
@ -593,11 +589,11 @@ class BFCAllocator : public Allocator {
|
||||
|
||||
// Stats.
|
||||
AllocatorStats stats_ TF_GUARDED_BY(lock_);
|
||||
#ifdef TENSORFLOW_MEM_DEBUG
|
||||
int64 action_counter_ TF_GUARDED_BY(lock_);
|
||||
#define MEM_DEBUG_SIZE_HISTORY_SIZE 4096
|
||||
int64 size_history_[MEM_DEBUG_SIZE_HISTORY_SIZE];
|
||||
#endif
|
||||
uint64 action_counter_ TF_GUARDED_BY(lock_);
|
||||
|
||||
// The circular buffer used to track memory operation history.
|
||||
static constexpr uint64 kMemDebugHistorySize = 4096;
|
||||
int64 size_history_[kMemDebugHistorySize];
|
||||
|
||||
friend class GPUBFCAllocatorPrivateMethodsTest;
|
||||
friend class GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific;
|
||||
|
87
tensorflow/core/common_runtime/bfc_allocator_test.cc
Normal file
87
tensorflow/core/common_runtime/bfc_allocator_test.cc
Normal file
@ -0,0 +1,87 @@
|
||||
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#include "tensorflow/core/common_runtime/bfc_allocator.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <random>
|
||||
|
||||
#include "tensorflow/core/framework/allocator.h"
|
||||
#include "tensorflow/core/platform/test.h"
|
||||
#include "tensorflow/core/platform/test_benchmark.h"
|
||||
|
||||
namespace tensorflow {
|
||||
|
||||
// A fake SubAllocator to test the performance of BFCAllocator.
|
||||
class FakeSubAllocator : public SubAllocator {
|
||||
public:
|
||||
FakeSubAllocator() : SubAllocator({}, {}), alloc_counter_(0) {}
|
||||
~FakeSubAllocator() override {}
|
||||
|
||||
// Alloc and Free functions are implemented as very cheap operations, so that
|
||||
// the benchmark can focus on the performance of BFCAllocator itself.
|
||||
void* Alloc(size_t alignment, size_t num_bytes,
|
||||
size_t* bytes_received) override {
|
||||
*bytes_received = num_bytes;
|
||||
return reinterpret_cast<void*>(alloc_counter_++);
|
||||
}
|
||||
|
||||
void Free(void* ptr, size_t num_bytes) override {}
|
||||
|
||||
bool SupportsCoalescing() const override { return false; }
|
||||
|
||||
private:
|
||||
int64 alloc_counter_;
|
||||
};
|
||||
|
||||
void BM_Allocator(::testing::benchmark::State& state) {
|
||||
constexpr int kAllocSize = 1 << 14;
|
||||
const int kLongLivedObjects = state.range(0);
|
||||
const int kShortLivedObjects = state.range(1);
|
||||
|
||||
FakeSubAllocator* sub_allocator = new FakeSubAllocator;
|
||||
BFCAllocator bfc_allocator(sub_allocator, 1 << 30, false, "GPU_0_bfc");
|
||||
|
||||
string test_op_name = "test_op";
|
||||
ScopedMemoryDebugAnnotation annotation(test_op_name.data());
|
||||
|
||||
// Allocate long lived objects.
|
||||
std::vector<void*> long_lived(kLongLivedObjects);
|
||||
for (int i = 0; i < kLongLivedObjects; i++) {
|
||||
long_lived[i] = bfc_allocator.AllocateRaw(1, kAllocSize);
|
||||
}
|
||||
std::vector<int> deallocation_order(kShortLivedObjects);
|
||||
for (int i = 0; i < kShortLivedObjects; i++) {
|
||||
deallocation_order[i] = i;
|
||||
}
|
||||
std::shuffle(deallocation_order.begin(), deallocation_order.end(),
|
||||
std::default_random_engine(0));
|
||||
|
||||
// Allocate and deallocate short lived objects.
|
||||
std::vector<void*> short_lived(kShortLivedObjects);
|
||||
for (auto _ : state) {
|
||||
for (int i = 0; i < kShortLivedObjects; i++) {
|
||||
short_lived[i] = bfc_allocator.AllocateRaw(1, kAllocSize);
|
||||
}
|
||||
for (int i = 0; i < kShortLivedObjects; i++) {
|
||||
bfc_allocator.DeallocateRaw(short_lived[deallocation_order[i]]);
|
||||
}
|
||||
}
|
||||
}
|
||||
BENCHMARK(BM_Allocator)
|
||||
->ArgPair(0, 256)
|
||||
->ArgPair(1000, 256)
|
||||
->ArgPair(10000, 256);
|
||||
|
||||
} // namespace tensorflow
|
Loading…
Reference in New Issue
Block a user