It does following two things:
1. filter out trace activities whose start_time is earlier than profiler session start_time. 2. Print a warning if multiple profiler session is created. PiperOrigin-RevId: 233448712
This commit is contained in:
parent
08703e1aad
commit
f35ec21e23
@ -29,6 +29,10 @@ TFE_Profiler* TFE_NewProfiler(TFE_Context* ctx) {
|
|||||||
return new TFE_Profiler(ctx);
|
return new TFE_Profiler(ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool TFE_ProfilerIsOk(TFE_Profiler* profiler) {
|
||||||
|
return profiler->profiler->Status().ok();
|
||||||
|
}
|
||||||
|
|
||||||
void TFE_DeleteProfiler(TFE_Profiler* profiler) { delete profiler; }
|
void TFE_DeleteProfiler(TFE_Profiler* profiler) { delete profiler; }
|
||||||
|
|
||||||
void TFE_ProfilerSerializeToString(TFE_Context* ctx, TFE_Profiler* profiler,
|
void TFE_ProfilerSerializeToString(TFE_Context* ctx, TFE_Profiler* profiler,
|
||||||
|
@ -33,6 +33,7 @@ TF_CAPI_EXPORT extern void TFE_OpConsumeInput(TFE_Op* op, TFE_TensorHandle* h,
|
|||||||
typedef struct TFE_Profiler TFE_Profiler;
|
typedef struct TFE_Profiler TFE_Profiler;
|
||||||
|
|
||||||
TF_CAPI_EXPORT extern TFE_Profiler* TFE_NewProfiler(TFE_Context* ctx);
|
TF_CAPI_EXPORT extern TFE_Profiler* TFE_NewProfiler(TFE_Context* ctx);
|
||||||
|
TF_CAPI_EXPORT extern bool TFE_ProfilerIsOk(TFE_Profiler* profiler);
|
||||||
TF_CAPI_EXPORT extern void TFE_DeleteProfiler(TFE_Profiler* profiler);
|
TF_CAPI_EXPORT extern void TFE_DeleteProfiler(TFE_Profiler* profiler);
|
||||||
|
|
||||||
// The output string is a binary string of tensorflow.tpu.Trace. User can write
|
// The output string is a binary string of tensorflow.tpu.Trace. User can write
|
||||||
|
@ -100,5 +100,23 @@ void ExecuteWithProfiling(bool async) {
|
|||||||
TEST(CAPI, ExecuteWithTracing) { ExecuteWithProfiling(false); }
|
TEST(CAPI, ExecuteWithTracing) { ExecuteWithProfiling(false); }
|
||||||
TEST(CAPI, ExecuteWithTracingAsync) { ExecuteWithProfiling(true); }
|
TEST(CAPI, ExecuteWithTracingAsync) { ExecuteWithProfiling(true); }
|
||||||
|
|
||||||
|
TEST(CAPI, MultipleProfilerSession) {
|
||||||
|
TF_Status* status = TF_NewStatus();
|
||||||
|
TFE_ContextOptions* opts = TFE_NewContextOptions();
|
||||||
|
TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(false));
|
||||||
|
TFE_Context* ctx = TFE_NewContext(opts, status);
|
||||||
|
CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
|
||||||
|
TFE_DeleteContextOptions(opts);
|
||||||
|
|
||||||
|
TFE_Profiler* profiler1 = TFE_NewProfiler(ctx);
|
||||||
|
EXPECT_TRUE(TFE_ProfilerIsOk(profiler1));
|
||||||
|
|
||||||
|
TFE_Profiler* profiler2 = TFE_NewProfiler(ctx);
|
||||||
|
EXPECT_FALSE(TFE_ProfilerIsOk(profiler2));
|
||||||
|
|
||||||
|
TFE_DeleteProfiler(profiler1);
|
||||||
|
TFE_DeleteProfiler(profiler2);
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
} // namespace tensorflow
|
} // namespace tensorflow
|
||||||
|
@ -17,6 +17,7 @@ limitations under the License.
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include "tensorflow/contrib/tpu/profiler/trace_events.pb.h"
|
#include "tensorflow/contrib/tpu/profiler/trace_events.pb.h"
|
||||||
#include "tensorflow/core/common_runtime/eager/context.h"
|
#include "tensorflow/core/common_runtime/eager/context.h"
|
||||||
|
#include "tensorflow/core/lib/core/error_codes.pb.h"
|
||||||
#include "tensorflow/core/platform/env.h"
|
#include "tensorflow/core/platform/env.h"
|
||||||
#include "tensorflow/core/platform/mutex.h"
|
#include "tensorflow/core/platform/mutex.h"
|
||||||
#include "tensorflow/core/platform/types.h"
|
#include "tensorflow/core/platform/types.h"
|
||||||
@ -28,6 +29,12 @@ namespace tensorflow {
|
|||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
|
// Track whether there's an active ProfilerSession.
|
||||||
|
// Prevents another ProfilerSession from creating ProfilerInterface(s), as they
|
||||||
|
// use singletons that do not allow concurrent profiling request (e.g.,
|
||||||
|
// DeviceTracer).
|
||||||
|
std::atomic<bool> session_active = ATOMIC_VAR_INIT(false);
|
||||||
|
|
||||||
void ConvertRunMetadataToTraceEvent(RunMetadata* run_metadata,
|
void ConvertRunMetadataToTraceEvent(RunMetadata* run_metadata,
|
||||||
tpu::Trace* trace,
|
tpu::Trace* trace,
|
||||||
const uint64 profile_start_time_micros) {
|
const uint64 profile_start_time_micros) {
|
||||||
@ -58,6 +65,9 @@ void ConvertRunMetadataToTraceEvent(RunMetadata* run_metadata,
|
|||||||
// Emit events.
|
// Emit events.
|
||||||
for (auto node :
|
for (auto node :
|
||||||
run_metadata->step_stats().dev_stats(device_id).node_stats()) {
|
run_metadata->step_stats().dev_stats(device_id).node_stats()) {
|
||||||
|
if (node.all_start_micros() < profile_start_time_micros) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
auto* event = trace->add_trace_events();
|
auto* event = trace->add_trace_events();
|
||||||
auto* args = event->mutable_args();
|
auto* args = event->mutable_args();
|
||||||
event->set_device_id(device_id);
|
event->set_device_id(device_id);
|
||||||
@ -102,6 +112,12 @@ Status ProfilerSession::SerializeToString(string* content) {
|
|||||||
profiler->CollectData(&run_metadata).IgnoreError();
|
profiler->CollectData(&run_metadata).IgnoreError();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (active_) {
|
||||||
|
// Allow another session to start.
|
||||||
|
session_active.store(false);
|
||||||
|
active_ = false;
|
||||||
|
}
|
||||||
|
|
||||||
tpu::Trace trace;
|
tpu::Trace trace;
|
||||||
|
|
||||||
ConvertRunMetadataToTraceEvent(&run_metadata, &trace, start_time_micros_);
|
ConvertRunMetadataToTraceEvent(&run_metadata, &trace, start_time_micros_);
|
||||||
@ -111,7 +127,14 @@ Status ProfilerSession::SerializeToString(string* content) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
ProfilerSession::ProfilerSession(ProfilerContext* const context)
|
ProfilerSession::ProfilerSession(ProfilerContext* const context)
|
||||||
: start_time_micros_(Env::Default()->NowNanos() / EnvTime::kMicrosToNanos) {
|
: active_(!session_active.exchange(true)),
|
||||||
|
start_time_micros_(Env::Default()->NowNanos() / EnvTime::kMicrosToNanos) {
|
||||||
|
if (!active_) {
|
||||||
|
status_ = tensorflow::Status(tensorflow::error::Code::UNAVAILABLE,
|
||||||
|
"Another profiling session is active.");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
LOG(INFO) << "Profile Session started.";
|
LOG(INFO) << "Profile Session started.";
|
||||||
|
|
||||||
if (context->eager_context != nullptr) {
|
if (context->eager_context != nullptr) {
|
||||||
@ -131,6 +154,11 @@ ProfilerSession::~ProfilerSession() {
|
|||||||
for (auto& profiler : profilers_) {
|
for (auto& profiler : profilers_) {
|
||||||
profiler->Stop().IgnoreError();
|
profiler->Stop().IgnoreError();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (active_) {
|
||||||
|
// Allow another session to start.
|
||||||
|
session_active.store(false);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace tensorflow
|
} // namespace tensorflow
|
||||||
|
@ -57,6 +57,9 @@ class ProfilerSession {
|
|||||||
std::vector<std::unique_ptr<tensorflow::profiler::ProfilerInterface>>
|
std::vector<std::unique_ptr<tensorflow::profiler::ProfilerInterface>>
|
||||||
profilers_ GUARDED_BY(mutex_);
|
profilers_ GUARDED_BY(mutex_);
|
||||||
|
|
||||||
|
// True if the session is active.
|
||||||
|
bool active_ GUARDED_BY(mutex_);
|
||||||
|
|
||||||
tensorflow::Status status_ GUARDED_BY(mutex_);
|
tensorflow::Status status_ GUARDED_BY(mutex_);
|
||||||
const uint64 start_time_micros_;
|
const uint64 start_time_micros_;
|
||||||
mutex mutex_;
|
mutex mutex_;
|
||||||
|
@ -25,6 +25,7 @@ from tensorflow.python import pywrap_tensorflow
|
|||||||
from tensorflow.python.eager import context
|
from tensorflow.python.eager import context
|
||||||
from tensorflow.python.framework import c_api_util
|
from tensorflow.python.framework import c_api_util
|
||||||
from tensorflow.python.platform import gfile
|
from tensorflow.python.platform import gfile
|
||||||
|
from tensorflow.python.platform import tf_logging as logging
|
||||||
|
|
||||||
LOGDIR_PLUGIN = 'plugins/profile'
|
LOGDIR_PLUGIN = 'plugins/profile'
|
||||||
|
|
||||||
@ -46,6 +47,10 @@ def start():
|
|||||||
raise AssertionError('Another profiler is running.')
|
raise AssertionError('Another profiler is running.')
|
||||||
with _profiler_lock:
|
with _profiler_lock:
|
||||||
_profiler = pywrap_tensorflow.TFE_NewProfiler(context.context()._handle) # pylint: disable=protected-access
|
_profiler = pywrap_tensorflow.TFE_NewProfiler(context.context()._handle) # pylint: disable=protected-access
|
||||||
|
if not pywrap_tensorflow.TFE_ProfilerIsOk(_profiler):
|
||||||
|
logging.warning('Another profiler session is running which is probably '
|
||||||
|
'created by profiler server. Please avoid using profiler '
|
||||||
|
'server and profiler APIs at the same time.')
|
||||||
|
|
||||||
|
|
||||||
def stop():
|
def stop():
|
||||||
|
@ -34,6 +34,7 @@ limitations under the License.
|
|||||||
%rename("%s") TFE_ContextAsyncWait;
|
%rename("%s") TFE_ContextAsyncWait;
|
||||||
%rename("%s") TFE_ContextAsyncClearError;
|
%rename("%s") TFE_ContextAsyncClearError;
|
||||||
%rename("%s") TFE_NewProfiler;
|
%rename("%s") TFE_NewProfiler;
|
||||||
|
%rename("%s") TFE_ProfilerIsOk;
|
||||||
%rename("%s") TFE_DeleteProfiler;
|
%rename("%s") TFE_DeleteProfiler;
|
||||||
%rename("%s") TFE_ProfilerSerializeToString;
|
%rename("%s") TFE_ProfilerSerializeToString;
|
||||||
%rename("%s") TFE_NewProfilerServerOptions;
|
%rename("%s") TFE_NewProfilerServerOptions;
|
||||||
|
Loading…
Reference in New Issue
Block a user