It does following two things:
1. filter out trace activities whose start_time is earlier than profiler session start_time. 2. Print a warning if multiple profiler session is created. PiperOrigin-RevId: 233448712
This commit is contained in:
parent
08703e1aad
commit
f35ec21e23
@ -29,6 +29,10 @@ TFE_Profiler* TFE_NewProfiler(TFE_Context* ctx) {
|
||||
return new TFE_Profiler(ctx);
|
||||
}
|
||||
|
||||
bool TFE_ProfilerIsOk(TFE_Profiler* profiler) {
|
||||
return profiler->profiler->Status().ok();
|
||||
}
|
||||
|
||||
void TFE_DeleteProfiler(TFE_Profiler* profiler) { delete profiler; }
|
||||
|
||||
void TFE_ProfilerSerializeToString(TFE_Context* ctx, TFE_Profiler* profiler,
|
||||
|
@ -33,6 +33,7 @@ TF_CAPI_EXPORT extern void TFE_OpConsumeInput(TFE_Op* op, TFE_TensorHandle* h,
|
||||
typedef struct TFE_Profiler TFE_Profiler;
|
||||
|
||||
TF_CAPI_EXPORT extern TFE_Profiler* TFE_NewProfiler(TFE_Context* ctx);
|
||||
TF_CAPI_EXPORT extern bool TFE_ProfilerIsOk(TFE_Profiler* profiler);
|
||||
TF_CAPI_EXPORT extern void TFE_DeleteProfiler(TFE_Profiler* profiler);
|
||||
|
||||
// The output string is a binary string of tensorflow.tpu.Trace. User can write
|
||||
|
@ -100,5 +100,23 @@ void ExecuteWithProfiling(bool async) {
|
||||
TEST(CAPI, ExecuteWithTracing) { ExecuteWithProfiling(false); }
|
||||
TEST(CAPI, ExecuteWithTracingAsync) { ExecuteWithProfiling(true); }
|
||||
|
||||
TEST(CAPI, MultipleProfilerSession) {
|
||||
TF_Status* status = TF_NewStatus();
|
||||
TFE_ContextOptions* opts = TFE_NewContextOptions();
|
||||
TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(false));
|
||||
TFE_Context* ctx = TFE_NewContext(opts, status);
|
||||
CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
|
||||
TFE_DeleteContextOptions(opts);
|
||||
|
||||
TFE_Profiler* profiler1 = TFE_NewProfiler(ctx);
|
||||
EXPECT_TRUE(TFE_ProfilerIsOk(profiler1));
|
||||
|
||||
TFE_Profiler* profiler2 = TFE_NewProfiler(ctx);
|
||||
EXPECT_FALSE(TFE_ProfilerIsOk(profiler2));
|
||||
|
||||
TFE_DeleteProfiler(profiler1);
|
||||
TFE_DeleteProfiler(profiler2);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace tensorflow
|
||||
|
@ -17,6 +17,7 @@ limitations under the License.
|
||||
#include <string>
|
||||
#include "tensorflow/contrib/tpu/profiler/trace_events.pb.h"
|
||||
#include "tensorflow/core/common_runtime/eager/context.h"
|
||||
#include "tensorflow/core/lib/core/error_codes.pb.h"
|
||||
#include "tensorflow/core/platform/env.h"
|
||||
#include "tensorflow/core/platform/mutex.h"
|
||||
#include "tensorflow/core/platform/types.h"
|
||||
@ -28,6 +29,12 @@ namespace tensorflow {
|
||||
|
||||
namespace {
|
||||
|
||||
// Track whether there's an active ProfilerSession.
|
||||
// Prevents another ProfilerSession from creating ProfilerInterface(s), as they
|
||||
// use singletons that do not allow concurrent profiling request (e.g.,
|
||||
// DeviceTracer).
|
||||
std::atomic<bool> session_active = ATOMIC_VAR_INIT(false);
|
||||
|
||||
void ConvertRunMetadataToTraceEvent(RunMetadata* run_metadata,
|
||||
tpu::Trace* trace,
|
||||
const uint64 profile_start_time_micros) {
|
||||
@ -58,6 +65,9 @@ void ConvertRunMetadataToTraceEvent(RunMetadata* run_metadata,
|
||||
// Emit events.
|
||||
for (auto node :
|
||||
run_metadata->step_stats().dev_stats(device_id).node_stats()) {
|
||||
if (node.all_start_micros() < profile_start_time_micros) {
|
||||
continue;
|
||||
}
|
||||
auto* event = trace->add_trace_events();
|
||||
auto* args = event->mutable_args();
|
||||
event->set_device_id(device_id);
|
||||
@ -102,6 +112,12 @@ Status ProfilerSession::SerializeToString(string* content) {
|
||||
profiler->CollectData(&run_metadata).IgnoreError();
|
||||
}
|
||||
|
||||
if (active_) {
|
||||
// Allow another session to start.
|
||||
session_active.store(false);
|
||||
active_ = false;
|
||||
}
|
||||
|
||||
tpu::Trace trace;
|
||||
|
||||
ConvertRunMetadataToTraceEvent(&run_metadata, &trace, start_time_micros_);
|
||||
@ -111,7 +127,14 @@ Status ProfilerSession::SerializeToString(string* content) {
|
||||
}
|
||||
|
||||
ProfilerSession::ProfilerSession(ProfilerContext* const context)
|
||||
: start_time_micros_(Env::Default()->NowNanos() / EnvTime::kMicrosToNanos) {
|
||||
: active_(!session_active.exchange(true)),
|
||||
start_time_micros_(Env::Default()->NowNanos() / EnvTime::kMicrosToNanos) {
|
||||
if (!active_) {
|
||||
status_ = tensorflow::Status(tensorflow::error::Code::UNAVAILABLE,
|
||||
"Another profiling session is active.");
|
||||
return;
|
||||
}
|
||||
|
||||
LOG(INFO) << "Profile Session started.";
|
||||
|
||||
if (context->eager_context != nullptr) {
|
||||
@ -131,6 +154,11 @@ ProfilerSession::~ProfilerSession() {
|
||||
for (auto& profiler : profilers_) {
|
||||
profiler->Stop().IgnoreError();
|
||||
}
|
||||
|
||||
if (active_) {
|
||||
// Allow another session to start.
|
||||
session_active.store(false);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace tensorflow
|
||||
|
@ -57,6 +57,9 @@ class ProfilerSession {
|
||||
std::vector<std::unique_ptr<tensorflow::profiler::ProfilerInterface>>
|
||||
profilers_ GUARDED_BY(mutex_);
|
||||
|
||||
// True if the session is active.
|
||||
bool active_ GUARDED_BY(mutex_);
|
||||
|
||||
tensorflow::Status status_ GUARDED_BY(mutex_);
|
||||
const uint64 start_time_micros_;
|
||||
mutex mutex_;
|
||||
|
@ -25,6 +25,7 @@ from tensorflow.python import pywrap_tensorflow
|
||||
from tensorflow.python.eager import context
|
||||
from tensorflow.python.framework import c_api_util
|
||||
from tensorflow.python.platform import gfile
|
||||
from tensorflow.python.platform import tf_logging as logging
|
||||
|
||||
LOGDIR_PLUGIN = 'plugins/profile'
|
||||
|
||||
@ -46,6 +47,10 @@ def start():
|
||||
raise AssertionError('Another profiler is running.')
|
||||
with _profiler_lock:
|
||||
_profiler = pywrap_tensorflow.TFE_NewProfiler(context.context()._handle) # pylint: disable=protected-access
|
||||
if not pywrap_tensorflow.TFE_ProfilerIsOk(_profiler):
|
||||
logging.warning('Another profiler session is running which is probably '
|
||||
'created by profiler server. Please avoid using profiler '
|
||||
'server and profiler APIs at the same time.')
|
||||
|
||||
|
||||
def stop():
|
||||
|
@ -34,6 +34,7 @@ limitations under the License.
|
||||
%rename("%s") TFE_ContextAsyncWait;
|
||||
%rename("%s") TFE_ContextAsyncClearError;
|
||||
%rename("%s") TFE_NewProfiler;
|
||||
%rename("%s") TFE_ProfilerIsOk;
|
||||
%rename("%s") TFE_DeleteProfiler;
|
||||
%rename("%s") TFE_ProfilerSerializeToString;
|
||||
%rename("%s") TFE_NewProfilerServerOptions;
|
||||
|
Loading…
Reference in New Issue
Block a user