add a metric to track bfc allocator delay.
PiperOrigin-RevId: 323655357 Change-Id: Ic4be5e5af5634e2cb1be983f7f46f7f42cc72e36
This commit is contained in:
parent
a78f101f8e
commit
dd13d3b4c6
@ -1671,6 +1671,7 @@ cc_library(
|
|||||||
deps = [
|
deps = [
|
||||||
":shared_counter",
|
":shared_counter",
|
||||||
"//tensorflow/core:framework",
|
"//tensorflow/core:framework",
|
||||||
|
"//tensorflow/core:framework_internal",
|
||||||
"//tensorflow/core:lib",
|
"//tensorflow/core:lib",
|
||||||
"//tensorflow/core:lib_internal",
|
"//tensorflow/core:lib_internal",
|
||||||
"//tensorflow/core:protos_all_cc",
|
"//tensorflow/core:protos_all_cc",
|
||||||
@ -1678,6 +1679,7 @@ cc_library(
|
|||||||
"//tensorflow/core/profiler/lib:traceme",
|
"//tensorflow/core/profiler/lib:traceme",
|
||||||
"@com_google_absl//absl/container:flat_hash_set",
|
"@com_google_absl//absl/container:flat_hash_set",
|
||||||
"@com_google_absl//absl/strings",
|
"@com_google_absl//absl/strings",
|
||||||
|
"@com_google_absl//absl/types:optional",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -14,6 +14,9 @@ limitations under the License.
|
|||||||
==============================================================================*/
|
==============================================================================*/
|
||||||
|
|
||||||
#include "tensorflow/core/common_runtime/allocator_retry.h"
|
#include "tensorflow/core/common_runtime/allocator_retry.h"
|
||||||
|
|
||||||
|
#include "absl/types/optional.h"
|
||||||
|
#include "tensorflow/core/framework/metrics.h"
|
||||||
#include "tensorflow/core/platform/env.h"
|
#include "tensorflow/core/platform/env.h"
|
||||||
#include "tensorflow/core/platform/logging.h"
|
#include "tensorflow/core/platform/logging.h"
|
||||||
#include "tensorflow/core/platform/mutex.h"
|
#include "tensorflow/core/platform/mutex.h"
|
||||||
@ -21,6 +24,28 @@ limitations under the License.
|
|||||||
|
|
||||||
namespace tensorflow {
|
namespace tensorflow {
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
class ScopedTimeTracker {
|
||||||
|
public:
|
||||||
|
explicit ScopedTimeTracker(Env* env) : env_(env) {}
|
||||||
|
void Enable() {
|
||||||
|
if (!start_us_) { // Only override start_us when not set yet.
|
||||||
|
start_us_ = env_->NowMicros();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
~ScopedTimeTracker() {
|
||||||
|
if (start_us_) {
|
||||||
|
uint64 end_us = env_->NowMicros();
|
||||||
|
metrics::UpdateBfcAllocatorDelayTime(end_us - *start_us_);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
Env* env_;
|
||||||
|
absl::optional<uint64> start_us_;
|
||||||
|
};
|
||||||
|
} // namespace
|
||||||
|
|
||||||
AllocatorRetry::AllocatorRetry() : env_(Env::Default()) {}
|
AllocatorRetry::AllocatorRetry() : env_(Env::Default()) {}
|
||||||
|
|
||||||
void* AllocatorRetry::AllocateRaw(
|
void* AllocatorRetry::AllocateRaw(
|
||||||
@ -31,6 +56,7 @@ void* AllocatorRetry::AllocateRaw(
|
|||||||
if (num_bytes == 0) {
|
if (num_bytes == 0) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
ScopedTimeTracker tracker(env_);
|
||||||
uint64 deadline_micros = 0;
|
uint64 deadline_micros = 0;
|
||||||
bool first = true;
|
bool first = true;
|
||||||
void* ptr = nullptr;
|
void* ptr = nullptr;
|
||||||
@ -43,6 +69,7 @@ void* AllocatorRetry::AllocateRaw(
|
|||||||
first = false;
|
first = false;
|
||||||
}
|
}
|
||||||
if (now < deadline_micros) {
|
if (now < deadline_micros) {
|
||||||
|
tracker.Enable();
|
||||||
mutex_lock l(mu_);
|
mutex_lock l(mu_);
|
||||||
WaitForMilliseconds(&l, &memory_returned_,
|
WaitForMilliseconds(&l, &memory_returned_,
|
||||||
(deadline_micros - now) / 1000);
|
(deadline_micros - now) / 1000);
|
||||||
|
@ -148,6 +148,11 @@ auto* mlir_import_failure_count = monitoring::Counter<0>::New(
|
|||||||
"/tensorflow/mlir/import_failure_count",
|
"/tensorflow/mlir/import_failure_count",
|
||||||
"The number of jobs that failed during mlir import or verification.");
|
"The number of jobs that failed during mlir import or verification.");
|
||||||
|
|
||||||
|
// Counter accumulating the BFC allocator delay, in microseconds. It is
// incremented through UpdateBfcAllocatorDelayTime().
auto* bfc_allocator_delay =
    monitoring::Counter<0>::New("/tensorflow/core/bfc_allocator_delay",
                                "The total time the BFC allocator spent "
                                "delayed while retrying allocations, in "
                                "microseconds.");
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
void RecordTFDataAutotune(const string& name) {
|
void RecordTFDataAutotune(const string& name) {
|
||||||
@ -274,6 +279,13 @@ void UpdateXlaCompilationTime(const uint64 compilation_time_usecs) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void UpdateBfcAllocatorDelayTime(const uint64 delay_usecs) {
|
||||||
|
static auto* bfc_allocator_delay_cell = bfc_allocator_delay->GetCell();
|
||||||
|
if (delay_usecs > 0) {
|
||||||
|
bfc_allocator_delay_cell->IncrementBy(delay_usecs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void IncrementMLIRImportFailureCount() {
|
void IncrementMLIRImportFailureCount() {
|
||||||
static auto* mlir_import_failure_count_cell =
|
static auto* mlir_import_failure_count_cell =
|
||||||
mlir_import_failure_count->GetCell();
|
mlir_import_failure_count->GetCell();
|
||||||
|
@ -120,6 +120,9 @@ void UpdateGrapplerPassTime(const string& pass_name,
|
|||||||
// Updates the metrics stored about time XLA spends compiling graphs.
|
// Updates the metrics stored about time XLA spends compiling graphs.
|
||||||
void UpdateXlaCompilationTime(const uint64 compilation_time_usecs);
|
void UpdateXlaCompilationTime(const uint64 compilation_time_usecs);
|
||||||
|
|
||||||
|
// Updates the metrics stored about the time the BFC allocator spends in delays (in microseconds).
|
||||||
|
void UpdateBfcAllocatorDelayTime(const uint64 delay_usecs);
|
||||||
|
|
||||||
// Increment the number of jobs that failed during import to mlir.
|
// Increment the number of jobs that failed during import to mlir.
|
||||||
void IncrementMLIRImportFailureCount();
|
void IncrementMLIRImportFailureCount();
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user