From f35ec21e236d421991692177f20c85ab84fc5a3e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 11 Feb 2019 11:47:02 -0800 Subject: [PATCH] It does following two things: 1. filter out trace activities whose start_time is earlier than profiler session start_time. 2. Print a warning if multiple profiler session is created. PiperOrigin-RevId: 233448712 --- tensorflow/c/eager/c_api_experimental.cc | 4 +++ tensorflow/c/eager/c_api_experimental.h | 1 + tensorflow/c/eager/c_api_experimental_test.cc | 18 +++++++++++ .../core/profiler/lib/profiler_session.cc | 30 ++++++++++++++++++- .../core/profiler/lib/profiler_session.h | 3 ++ tensorflow/python/eager/profiler.py | 5 ++++ tensorflow/python/pywrap_tfe.i | 1 + 7 files changed, 61 insertions(+), 1 deletion(-) diff --git a/tensorflow/c/eager/c_api_experimental.cc b/tensorflow/c/eager/c_api_experimental.cc index e67e54b1529..f46562a21df 100644 --- a/tensorflow/c/eager/c_api_experimental.cc +++ b/tensorflow/c/eager/c_api_experimental.cc @@ -29,6 +29,10 @@ TFE_Profiler* TFE_NewProfiler(TFE_Context* ctx) { return new TFE_Profiler(ctx); } +bool TFE_ProfilerIsOk(TFE_Profiler* profiler) { + return profiler->profiler->Status().ok(); +} + void TFE_DeleteProfiler(TFE_Profiler* profiler) { delete profiler; } void TFE_ProfilerSerializeToString(TFE_Context* ctx, TFE_Profiler* profiler, diff --git a/tensorflow/c/eager/c_api_experimental.h b/tensorflow/c/eager/c_api_experimental.h index ed653744381..e6a3d737073 100644 --- a/tensorflow/c/eager/c_api_experimental.h +++ b/tensorflow/c/eager/c_api_experimental.h @@ -33,6 +33,7 @@ TF_CAPI_EXPORT extern void TFE_OpConsumeInput(TFE_Op* op, TFE_TensorHandle* h, typedef struct TFE_Profiler TFE_Profiler; TF_CAPI_EXPORT extern TFE_Profiler* TFE_NewProfiler(TFE_Context* ctx); +TF_CAPI_EXPORT extern bool TFE_ProfilerIsOk(TFE_Profiler* profiler); TF_CAPI_EXPORT extern void TFE_DeleteProfiler(TFE_Profiler* profiler); // The output string is a binary string of tensorflow.tpu.Trace. User can write diff --git a/tensorflow/c/eager/c_api_experimental_test.cc b/tensorflow/c/eager/c_api_experimental_test.cc index af55fee66e8..1994d652245 100644 --- a/tensorflow/c/eager/c_api_experimental_test.cc +++ b/tensorflow/c/eager/c_api_experimental_test.cc @@ -100,5 +100,23 @@ void ExecuteWithProfiling(bool async) { TEST(CAPI, ExecuteWithTracing) { ExecuteWithProfiling(false); } TEST(CAPI, ExecuteWithTracingAsync) { ExecuteWithProfiling(true); } +TEST(CAPI, MultipleProfilerSession) { + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_ContextOptionsSetAsync(opts, static_cast(false)); + TFE_Context* ctx = TFE_NewContext(opts, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_Profiler* profiler1 = TFE_NewProfiler(ctx); + EXPECT_TRUE(TFE_ProfilerIsOk(profiler1)); + + TFE_Profiler* profiler2 = TFE_NewProfiler(ctx); + EXPECT_FALSE(TFE_ProfilerIsOk(profiler2)); + + TFE_DeleteProfiler(profiler1); + TFE_DeleteProfiler(profiler2); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/core/profiler/lib/profiler_session.cc b/tensorflow/core/profiler/lib/profiler_session.cc index 6ef779d5e1c..0abb63431e7 100644 --- a/tensorflow/core/profiler/lib/profiler_session.cc +++ b/tensorflow/core/profiler/lib/profiler_session.cc @@ -17,6 +17,7 @@ limitations under the License. #include #include "tensorflow/contrib/tpu/profiler/trace_events.pb.h" #include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/lib/core/error_codes.pb.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/types.h" @@ -28,6 +29,12 @@ namespace tensorflow { namespace { +// Track whether there's an active ProfilerSession. +// Prevents another ProfilerSession from creating ProfilerInterface(s), as they +// use singletons that do not allow concurrent profiling request (e.g., +// DeviceTracer). +std::atomic session_active = ATOMIC_VAR_INIT(false); + void ConvertRunMetadataToTraceEvent(RunMetadata* run_metadata, tpu::Trace* trace, const uint64 profile_start_time_micros) { @@ -58,6 +65,9 @@ void ConvertRunMetadataToTraceEvent(RunMetadata* run_metadata, // Emit events. for (auto node : run_metadata->step_stats().dev_stats(device_id).node_stats()) { + if (node.all_start_micros() < profile_start_time_micros) { + continue; + } auto* event = trace->add_trace_events(); auto* args = event->mutable_args(); event->set_device_id(device_id); @@ -102,6 +112,12 @@ Status ProfilerSession::SerializeToString(string* content) { profiler->CollectData(&run_metadata).IgnoreError(); } + if (active_) { + // Allow another session to start. + session_active.store(false); + active_ = false; + } + tpu::Trace trace; ConvertRunMetadataToTraceEvent(&run_metadata, &trace, start_time_micros_); @@ -111,7 +127,14 @@ Status ProfilerSession::SerializeToString(string* content) { } ProfilerSession::ProfilerSession(ProfilerContext* const context) - : start_time_micros_(Env::Default()->NowNanos() / EnvTime::kMicrosToNanos) { + : active_(!session_active.exchange(true)), + start_time_micros_(Env::Default()->NowNanos() / EnvTime::kMicrosToNanos) { + if (!active_) { + status_ = tensorflow::Status(tensorflow::error::Code::UNAVAILABLE, + "Another profiling session is active."); + return; + } + LOG(INFO) << "Profile Session started."; if (context->eager_context != nullptr) { @@ -131,6 +154,11 @@ ProfilerSession::~ProfilerSession() { for (auto& profiler : profilers_) { profiler->Stop().IgnoreError(); } + + if (active_) { + // Allow another session to start. + session_active.store(false); + } } } // namespace tensorflow diff --git a/tensorflow/core/profiler/lib/profiler_session.h b/tensorflow/core/profiler/lib/profiler_session.h index e764852ae3d..07276571244 100644 --- a/tensorflow/core/profiler/lib/profiler_session.h +++ b/tensorflow/core/profiler/lib/profiler_session.h @@ -57,6 +57,9 @@ class ProfilerSession { std::vector> profilers_ GUARDED_BY(mutex_); + // True if the session is active. + bool active_ GUARDED_BY(mutex_); + tensorflow::Status status_ GUARDED_BY(mutex_); const uint64 start_time_micros_; mutex mutex_; diff --git a/tensorflow/python/eager/profiler.py b/tensorflow/python/eager/profiler.py index eaba0fa2459..7ddd891cdcb 100644 --- a/tensorflow/python/eager/profiler.py +++ b/tensorflow/python/eager/profiler.py @@ -25,6 +25,7 @@ from tensorflow.python import pywrap_tensorflow from tensorflow.python.eager import context from tensorflow.python.framework import c_api_util from tensorflow.python.platform import gfile +from tensorflow.python.platform import tf_logging as logging LOGDIR_PLUGIN = 'plugins/profile' @@ -46,6 +47,10 @@ def start(): raise AssertionError('Another profiler is running.') with _profiler_lock: _profiler = pywrap_tensorflow.TFE_NewProfiler(context.context()._handle) # pylint: disable=protected-access + if not pywrap_tensorflow.TFE_ProfilerIsOk(_profiler): + logging.warning('Another profiler session is running which is probably ' + 'created by profiler server. Please avoid using profiler ' + 'server and profiler APIs at the same time.') def stop(): diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i index b19cc536b70..db6c14fb601 100755 --- a/tensorflow/python/pywrap_tfe.i +++ b/tensorflow/python/pywrap_tfe.i @@ -34,6 +34,7 @@ limitations under the License. %rename("%s") TFE_ContextAsyncWait; %rename("%s") TFE_ContextAsyncClearError; %rename("%s") TFE_NewProfiler; +%rename("%s") TFE_ProfilerIsOk; %rename("%s") TFE_DeleteProfiler; %rename("%s") TFE_ProfilerSerializeToString; %rename("%s") TFE_NewProfilerServerOptions;