It does the following two things:

1. Filter out trace activities whose start_time is earlier than the profiler session's start_time.
2. Print a warning if multiple profiler sessions are created (see the usage sketch after the commit metadata below).

PiperOrigin-RevId: 233448712
A. Unique TensorFlower 2019-02-11 11:47:02 -08:00 committed by TensorFlower Gardener
parent 08703e1aad
commit f35ec21e23
7 changed files with 61 additions and 1 deletion
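
The second point is surfaced through a new C API, TFE_ProfilerIsOk. A minimal sketch of how a caller might use it, assuming a TFE_Context* ctx created the same way as in the MultipleProfilerSession test added below (the wrapper function name is hypothetical, not part of this change):

#include "tensorflow/c/eager/c_api.h"
#include "tensorflow/c/eager/c_api_experimental.h"
#include "tensorflow/core/platform/logging.h"

// Hypothetical helper: creates a profiler and warns if another profiler
// session is already active (in that case its status is not OK).
TFE_Profiler* StartProfilerOrWarn(TFE_Context* ctx) {
  TFE_Profiler* profiler = TFE_NewProfiler(ctx);
  if (!TFE_ProfilerIsOk(profiler)) {
    // Mirrors the Python-side warning added in the eager profiler module below.
    LOG(WARNING) << "Another profiler session is already running; "
                 << "this one will not collect data.";
  }
  return profiler;
}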


@@ -29,6 +29,10 @@ TFE_Profiler* TFE_NewProfiler(TFE_Context* ctx) {
   return new TFE_Profiler(ctx);
 }
 
+bool TFE_ProfilerIsOk(TFE_Profiler* profiler) {
+  return profiler->profiler->Status().ok();
+}
+
 void TFE_DeleteProfiler(TFE_Profiler* profiler) { delete profiler; }
 
 void TFE_ProfilerSerializeToString(TFE_Context* ctx, TFE_Profiler* profiler,


@@ -33,6 +33,7 @@ TF_CAPI_EXPORT extern void TFE_OpConsumeInput(TFE_Op* op, TFE_TensorHandle* h,
 typedef struct TFE_Profiler TFE_Profiler;
 
 TF_CAPI_EXPORT extern TFE_Profiler* TFE_NewProfiler(TFE_Context* ctx);
+TF_CAPI_EXPORT extern bool TFE_ProfilerIsOk(TFE_Profiler* profiler);
 TF_CAPI_EXPORT extern void TFE_DeleteProfiler(TFE_Profiler* profiler);
 
 // The output string is a binary string of tensorflow.tpu.Trace. User can write


@@ -100,5 +100,23 @@ void ExecuteWithProfiling(bool async) {
 TEST(CAPI, ExecuteWithTracing) { ExecuteWithProfiling(false); }
 TEST(CAPI, ExecuteWithTracingAsync) { ExecuteWithProfiling(true); }
 
+TEST(CAPI, MultipleProfilerSession) {
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(false));
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  TFE_Profiler* profiler1 = TFE_NewProfiler(ctx);
+  EXPECT_TRUE(TFE_ProfilerIsOk(profiler1));
+
+  TFE_Profiler* profiler2 = TFE_NewProfiler(ctx);
+  EXPECT_FALSE(TFE_ProfilerIsOk(profiler2));
+
+  TFE_DeleteProfiler(profiler1);
+  TFE_DeleteProfiler(profiler2);
+}
+
 }  // namespace
 }  // namespace tensorflow


@@ -17,6 +17,7 @@ limitations under the License.
 #include <string>
 #include "tensorflow/contrib/tpu/profiler/trace_events.pb.h"
 #include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
@@ -28,6 +29,12 @@ namespace tensorflow {
 
 namespace {
 
+// Track whether there's an active ProfilerSession.
+// Prevents another ProfilerSession from creating ProfilerInterface(s), as they
+// use singletons that do not allow concurrent profiling request (e.g.,
+// DeviceTracer).
+std::atomic<bool> session_active = ATOMIC_VAR_INIT(false);
+
 void ConvertRunMetadataToTraceEvent(RunMetadata* run_metadata,
                                     tpu::Trace* trace,
                                     const uint64 profile_start_time_micros) {
@@ -58,6 +65,9 @@ void ConvertRunMetadataToTraceEvent(RunMetadata* run_metadata,
     // Emit events.
     for (auto node :
          run_metadata->step_stats().dev_stats(device_id).node_stats()) {
+      if (node.all_start_micros() < profile_start_time_micros) {
+        continue;
+      }
       auto* event = trace->add_trace_events();
       auto* args = event->mutable_args();
       event->set_device_id(device_id);
@@ -102,6 +112,12 @@ Status ProfilerSession::SerializeToString(string* content) {
     profiler->CollectData(&run_metadata).IgnoreError();
   }
 
+  if (active_) {
+    // Allow another session to start.
+    session_active.store(false);
+    active_ = false;
+  }
+
   tpu::Trace trace;
   ConvertRunMetadataToTraceEvent(&run_metadata, &trace, start_time_micros_);
 
@@ -111,7 +127,14 @@ Status ProfilerSession::SerializeToString(string* content) {
 }
 
 ProfilerSession::ProfilerSession(ProfilerContext* const context)
-    : start_time_micros_(Env::Default()->NowNanos() / EnvTime::kMicrosToNanos) {
+    : active_(!session_active.exchange(true)),
+      start_time_micros_(Env::Default()->NowNanos() / EnvTime::kMicrosToNanos) {
+  if (!active_) {
+    status_ = tensorflow::Status(tensorflow::error::Code::UNAVAILABLE,
+                                 "Another profiling session is active.");
+    return;
+  }
+
   LOG(INFO) << "Profile Session started.";
 
   if (context->eager_context != nullptr) {
@@ -131,6 +154,11 @@ ProfilerSession::~ProfilerSession() {
   for (auto& profiler : profilers_) {
     profiler->Stop().IgnoreError();
   }
+
+  if (active_) {
+    // Allow another session to start.
+    session_active.store(false);
+  }
 }
 
 }  // namespace tensorflow
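
Taken together, the profiler_session.cc hunks above implement a single-active-session guard: the constructor claims a global flag with an atomic exchange and records whether the claim succeeded, and the destructor (or SerializeToString) releases the flag. Below is a standalone sketch of just that pattern, with hypothetical names (GuardedSession, session_active) and none of the profiler plumbing:

#include <atomic>
#include <iostream>

// Hypothetical stand-in for ProfilerSession's single-active-session guard.
static std::atomic<bool> session_active{false};

class GuardedSession {
 public:
  // exchange(true) returns the previous value: false means this session
  // claimed the flag and is the active one; true means another session
  // already holds it.
  GuardedSession() : active_(!session_active.exchange(true)) {
    if (!active_) {
      std::cerr << "Another session is active.\n";
    }
  }

  ~GuardedSession() {
    // Only the session that claimed the flag releases it.
    if (active_) session_active.store(false);
  }

  bool ok() const { return active_; }

 private:
  const bool active_;
};

int main() {
  GuardedSession first;   // claims the flag
  GuardedSession second;  // sees the flag already set, stays inactive
  std::cout << first.ok() << " " << second.ok() << "\n";  // prints "1 0"
  return 0;
}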


@@ -57,6 +57,9 @@ class ProfilerSession {
   std::vector<std::unique_ptr<tensorflow::profiler::ProfilerInterface>>
       profilers_ GUARDED_BY(mutex_);
 
+  // True if the session is active.
+  bool active_ GUARDED_BY(mutex_);
+
   tensorflow::Status status_ GUARDED_BY(mutex_);
   const uint64 start_time_micros_;
   mutex mutex_;


@@ -25,6 +25,7 @@ from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
 from tensorflow.python.framework import c_api_util
 from tensorflow.python.platform import gfile
+from tensorflow.python.platform import tf_logging as logging
 
 LOGDIR_PLUGIN = 'plugins/profile'
@@ -46,6 +47,10 @@ def start():
     raise AssertionError('Another profiler is running.')
   with _profiler_lock:
     _profiler = pywrap_tensorflow.TFE_NewProfiler(context.context()._handle)  # pylint: disable=protected-access
+    if not pywrap_tensorflow.TFE_ProfilerIsOk(_profiler):
+      logging.warning('Another profiler session is running which is probably '
+                      'created by profiler server. Please avoid using profiler '
+                      'server and profiler APIs at the same time.')
 
 
 def stop():


@@ -34,6 +34,7 @@ limitations under the License.
 %rename("%s") TFE_ContextAsyncWait;
 %rename("%s") TFE_ContextAsyncClearError;
 %rename("%s") TFE_NewProfiler;
+%rename("%s") TFE_ProfilerIsOk;
 %rename("%s") TFE_DeleteProfiler;
 %rename("%s") TFE_ProfilerSerializeToString;
 %rename("%s") TFE_NewProfilerServerOptions;