add a metric to track bfc allocator delay.
PiperOrigin-RevId: 323655357 Change-Id: Ic4be5e5af5634e2cb1be983f7f46f7f42cc72e36
This commit is contained in:
parent
a78f101f8e
commit
dd13d3b4c6
tensorflow/core
@ -1671,6 +1671,7 @@ cc_library(
|
||||
deps = [
|
||||
":shared_counter",
|
||||
"//tensorflow/core:framework",
|
||||
"//tensorflow/core:framework_internal",
|
||||
"//tensorflow/core:lib",
|
||||
"//tensorflow/core:lib_internal",
|
||||
"//tensorflow/core:protos_all_cc",
|
||||
@ -1678,6 +1679,7 @@ cc_library(
|
||||
"//tensorflow/core/profiler/lib:traceme",
|
||||
"@com_google_absl//absl/container:flat_hash_set",
|
||||
"@com_google_absl//absl/strings",
|
||||
"@com_google_absl//absl/types:optional",
|
||||
],
|
||||
)
|
||||
|
||||
|
@ -14,6 +14,9 @@ limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include "tensorflow/core/common_runtime/allocator_retry.h"
|
||||
|
||||
#include "absl/types/optional.h"
|
||||
#include "tensorflow/core/framework/metrics.h"
|
||||
#include "tensorflow/core/platform/env.h"
|
||||
#include "tensorflow/core/platform/logging.h"
|
||||
#include "tensorflow/core/platform/mutex.h"
|
||||
@ -21,6 +24,28 @@ limitations under the License.
|
||||
|
||||
namespace tensorflow {
|
||||
|
||||
namespace {
|
||||
class ScopedTimeTracker {
|
||||
public:
|
||||
explicit ScopedTimeTracker(Env* env) : env_(env) {}
|
||||
void Enable() {
|
||||
if (!start_us_) { // Only override start_us when not set yet.
|
||||
start_us_ = env_->NowMicros();
|
||||
}
|
||||
}
|
||||
~ScopedTimeTracker() {
|
||||
if (start_us_) {
|
||||
uint64 end_us = env_->NowMicros();
|
||||
metrics::UpdateBfcAllocatorDelayTime(end_us - *start_us_);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
Env* env_;
|
||||
absl::optional<uint64> start_us_;
|
||||
};
|
||||
} // namespace
|
||||
|
||||
AllocatorRetry::AllocatorRetry() : env_(Env::Default()) {}
|
||||
|
||||
void* AllocatorRetry::AllocateRaw(
|
||||
@ -31,6 +56,7 @@ void* AllocatorRetry::AllocateRaw(
|
||||
if (num_bytes == 0) {
|
||||
return nullptr;
|
||||
}
|
||||
ScopedTimeTracker tracker(env_);
|
||||
uint64 deadline_micros = 0;
|
||||
bool first = true;
|
||||
void* ptr = nullptr;
|
||||
@ -43,6 +69,7 @@ void* AllocatorRetry::AllocateRaw(
|
||||
first = false;
|
||||
}
|
||||
if (now < deadline_micros) {
|
||||
tracker.Enable();
|
||||
mutex_lock l(mu_);
|
||||
WaitForMilliseconds(&l, &memory_returned_,
|
||||
(deadline_micros - now) / 1000);
|
||||
|
@ -148,6 +148,11 @@ auto* mlir_import_failure_count = monitoring::Counter<0>::New(
|
||||
"/tensorflow/mlir/import_failure_count",
|
||||
"The number of jobs that failed during mlir import or verification.");
|
||||
|
||||
// Counter that accumulates the delay values (in microseconds) passed to
// UpdateBfcAllocatorDelayTime(). The description previously duplicated the
// text of the graph-optimization-pass metric; it now describes this counter.
auto* bfc_allocator_delay =
    monitoring::Counter<0>::New("/tensorflow/core/bfc_allocator_delay",
                                "The total time the BFC allocator spent "
                                "delayed while retrying allocations, in "
                                "microseconds.");
|
||||
|
||||
} // namespace
|
||||
|
||||
void RecordTFDataAutotune(const string& name) {
|
||||
@ -274,6 +279,13 @@ void UpdateXlaCompilationTime(const uint64 compilation_time_usecs) {
|
||||
}
|
||||
}
|
||||
|
||||
void UpdateBfcAllocatorDelayTime(const uint64 delay_usecs) {
|
||||
static auto* bfc_allocator_delay_cell = bfc_allocator_delay->GetCell();
|
||||
if (delay_usecs > 0) {
|
||||
bfc_allocator_delay_cell->IncrementBy(delay_usecs);
|
||||
}
|
||||
}
|
||||
|
||||
void IncrementMLIRImportFailureCount() {
|
||||
static auto* mlir_import_failure_count_cell =
|
||||
mlir_import_failure_count->GetCell();
|
||||
|
@ -120,6 +120,9 @@ void UpdateGrapplerPassTime(const string& pass_name,
|
||||
// Updates the metrics stored about time XLA spends compiling graphs.
|
||||
void UpdateXlaCompilationTime(const uint64 compilation_time_usecs);
|
||||
|
||||
// Updates the metric that accumulates the time, in microseconds, that the BFC
// allocator spent delayed.
|
||||
void UpdateBfcAllocatorDelayTime(const uint64 delay_usecs);
|
||||
|
||||
// Increment the number of jobs that failed during import to mlir.
|
||||
void IncrementMLIRImportFailureCount();
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user