Add a metric to track BFC allocator delay.

PiperOrigin-RevId: 323655357
Change-Id: Ic4be5e5af5634e2cb1be983f7f46f7f42cc72e36
A. Unique TensorFlower 2020-07-28 14:34:30 -07:00 committed by TensorFlower Gardener
parent a78f101f8e
commit dd13d3b4c6
4 changed files with 44 additions and 0 deletions
tensorflow/core


@@ -1671,6 +1671,7 @@ cc_library(
    deps = [
        ":shared_counter",
        "//tensorflow/core:framework",
        "//tensorflow/core:framework_internal",
        "//tensorflow/core:lib",
        "//tensorflow/core:lib_internal",
        "//tensorflow/core:protos_all_cc",
@@ -1678,6 +1679,7 @@ cc_library(
        "//tensorflow/core/profiler/lib:traceme",
        "@com_google_absl//absl/container:flat_hash_set",
        "@com_google_absl//absl/strings",
        "@com_google_absl//absl/types:optional",
    ],
)


@@ -14,6 +14,9 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/core/common_runtime/allocator_retry.h"
#include "absl/types/optional.h"
#include "tensorflow/core/framework/metrics.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mutex.h"
@@ -21,6 +24,28 @@ limitations under the License.
namespace tensorflow {
namespace {
class ScopedTimeTracker {
 public:
  explicit ScopedTimeTracker(Env* env) : env_(env) {}

  // Starts the timer; only the first call sets start_us_, so repeated calls
  // during one allocation are folded into a single delay interval.
  void Enable() {
    if (!start_us_) {
      start_us_ = env_->NowMicros();
    }
  }

  // Reports the elapsed time as BFC allocator delay, but only if the timer
  // was ever started.
  ~ScopedTimeTracker() {
    if (start_us_) {
      uint64 end_us = env_->NowMicros();
      metrics::UpdateBfcAllocatorDelayTime(end_us - *start_us_);
    }
  }

 private:
  Env* env_;
  absl::optional<uint64> start_us_;
};
}  // namespace

AllocatorRetry::AllocatorRetry() : env_(Env::Default()) {}

void* AllocatorRetry::AllocateRaw(
@@ -31,6 +56,7 @@ void* AllocatorRetry::AllocateRaw(
  if (num_bytes == 0) {
    return nullptr;
  }
  ScopedTimeTracker tracker(env_);
  uint64 deadline_micros = 0;
  bool first = true;
  void* ptr = nullptr;
@@ -43,6 +69,7 @@ void* AllocatorRetry::AllocateRaw(
      first = false;
    }
    if (now < deadline_micros) {
      tracker.Enable();
      mutex_lock l(mu_);
      WaitForMilliseconds(&l, &memory_returned_,
                          (deadline_micros - now) / 1000);
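Taken together, the two hunks above implement a simple RAII pattern: the tracker is constructed for every allocation, armed only on the path that actually waits for memory to be returned, and reports the accumulated wait exactly once, when AllocateRaw returns and the tracker leaves scope. A minimal standalone sketch of that pattern follows; the class name, the ReportDelayMicros sink, and the std::chrono clock are illustrative stand-ins, not part of this change.

#include <chrono>
#include <cstdint>
#include <iostream>
#include <optional>
#include <thread>

// Stand-in for metrics::UpdateBfcAllocatorDelayTime in this sketch.
void ReportDelayMicros(uint64_t delay_us) {
  std::cout << "allocation delayed " << delay_us << " us\n";
}

class ScopedDelayTracker {
 public:
  // Arms the timer; only the first call records a start time, so several
  // waits within one allocation count as a single delay interval.
  void Enable() {
    if (!start_) start_ = std::chrono::steady_clock::now();
  }

  // Reports once, at end of scope, and only if a wait actually happened.
  ~ScopedDelayTracker() {
    if (start_) {
      const auto elapsed = std::chrono::steady_clock::now() - *start_;
      ReportDelayMicros(static_cast<uint64_t>(
          std::chrono::duration_cast<std::chrono::microseconds>(elapsed)
              .count()));
    }
  }

 private:
  std::optional<std::chrono::steady_clock::time_point> start_;
};

int main() {
  ScopedDelayTracker tracker;
  tracker.Enable();  // first wait: start time is recorded
  std::this_thread::sleep_for(std::chrono::milliseconds(5));
  tracker.Enable();  // later waits do not reset the start time
  return 0;
}  // destructor fires here and reports roughly 5000 us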


@@ -148,6 +148,11 @@ auto* mlir_import_failure_count = monitoring::Counter<0>::New(
    "/tensorflow/mlir/import_failure_count",
    "The number of jobs that failed during mlir import or verification.");

auto* bfc_allocator_delay = monitoring::Counter<0>::New(
    "/tensorflow/core/bfc_allocator_delay",
    "The total time in microseconds that the BFC allocator spent delaying "
    "allocations while waiting for memory to be freed.");
}  // namespace
void RecordTFDataAutotune(const string& name) {
@@ -274,6 +279,13 @@ void UpdateXlaCompilationTime(const uint64 compilation_time_usecs) {
  }
}

void UpdateBfcAllocatorDelayTime(const uint64 delay_usecs) {
  static auto* bfc_allocator_delay_cell = bfc_allocator_delay->GetCell();
  if (delay_usecs > 0) {
    bfc_allocator_delay_cell->IncrementBy(delay_usecs);
  }
}
void IncrementMLIRImportFailureCount() {
  static auto* mlir_import_failure_count_cell =
      mlir_import_failure_count->GetCell();
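The new metric is a cumulative counter with a single, label-free cell, the usual shape of monitoring::Counter<0>: New() registers the metric under its name, GetCell() returns the one cell, IncrementBy() adds to a monotonically growing total, and value() reads it back. A small illustrative sketch follows; the metric name, helper names, and values here are hypothetical, not part of this change.

#include "tensorflow/core/lib/monitoring/counter.h"

namespace {
// Illustrative counter mirroring the bfc_allocator_delay definition above.
auto* example_delay = tensorflow::monitoring::Counter<0>::New(
    "/example/delay_usecs", "Total example delay in microseconds.");
}  // namespace

void RecordExampleDelay(tensorflow::uint64 delay_usecs) {
  static auto* cell = example_delay->GetCell();  // the single cell, cached
  if (delay_usecs > 0) {
    cell->IncrementBy(delay_usecs);  // running, monotonic total
  }
}

tensorflow::int64 ExampleDelayTotal() {
  return example_delay->GetCell()->value();  // read back the accumulated total
}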


@@ -120,6 +120,9 @@ void UpdateGrapplerPassTime(const string& pass_name,
// Updates the metrics stored about the time XLA spends compiling graphs.
void UpdateXlaCompilationTime(const uint64 compilation_time_usecs);
// Updates the metrics stored about the time the BFC allocator spends delaying allocations.
void UpdateBfcAllocatorDelayTime(const uint64 delay_usecs);
// Increment the number of jobs that failed during import to mlir.
void IncrementMLIRImportFailureCount();
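For reference, a hypothetical call site for the new helper; in this change the only real caller is the ScopedTimeTracker in allocator_retry.cc, and the wait being measured here is purely illustrative.

#include "tensorflow/core/framework/metrics.h"
#include "tensorflow/core/platform/env.h"

// Reports the time elapsed since wait_start_us as BFC allocator delay.
// Passing zero is a no-op, so callers do not need to special-case it.
void RecordWaitSince(tensorflow::uint64 wait_start_us) {
  const tensorflow::uint64 now_us = tensorflow::Env::Default()->NowMicros();
  tensorflow::metrics::UpdateBfcAllocatorDelayTime(now_us - wait_start_us);
}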