Add profiler service to eager context so that it is started by default.

PiperOrigin-RevId: 294348740
Change-Id: I99eb430c8a5c1c35ad442987a7c50af3f1f92e29
This commit is contained in:
A. Unique TensorFlower 2020-02-10 18:06:41 -08:00 committed by TensorFlower Gardener
parent 114762ab9e
commit 1a287cbaee
7 changed files with 61 additions and 9 deletions

View File

@ -48,9 +48,11 @@ void TFE_OpConsumeInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) {
}
// Starts the gRPC profiler service on `port`.
//
// NOTE(review): the stripped diff fused the removed implementation
// (`tensorflow::StartProfilerServer(port).release()`) with its replacement;
// only the ProfilerServer-based version below exists after this change — the
// old free function was deleted, so keeping both lines would not compile.
void TFE_StartProfilerServer(int port) {
  auto profiler_server = absl::make_unique<tensorflow::ProfilerServer>();
  profiler_server->StartProfilerServer(port);
  // Release child server thread intentionally. The child thread can be
  // terminated when the main program exits.
  profiler_server.release();
}
void TFE_ContextEnableGraphCollection(TFE_Context* ctx) {

View File

@ -77,6 +77,7 @@ tf_cuda_library(
"//tensorflow/core/distributed_runtime:server_lib",
"//tensorflow/core/distributed_runtime:worker_session",
"//tensorflow/core/distributed_runtime/eager:eager_client",
"//tensorflow/core/profiler/rpc:profiler_server",
],
}),
)

View File

@ -45,6 +45,7 @@ limitations under the License.
#include "tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h"
#include "tensorflow/core/distributed_runtime/device_resolver_distributed.h"
#include "tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h"
#include "tensorflow/core/profiler/rpc/profiler_server.h"
#endif // !IS_MOBILE_PLATFORM
#include "tensorflow/core/framework/resource_mgr.h"
#include "tensorflow/core/lib/core/blocking_counter.h"
@ -110,6 +111,8 @@ EagerContext::EagerContext(
#if !defined(IS_MOBILE_PLATFORM)
context_id_ = kInvalidContextId;
profiler_server_ = absl::make_unique<ProfilerServer>();
profiler_server_->MaybeStartProfilerServer();
#endif // IS_MOBILE_PLATFORM
std::unique_ptr<DeviceResolverInterface> drl(

View File

@ -73,6 +73,8 @@ namespace eager {
class RemoteMgr;
} // namespace eager
class ProfilerServer;
// LINT.IfChange
// Note: Keep in sync with exported copy of enum in eager/c_api.h.
enum ContextDevicePlacementPolicy {
@ -599,6 +601,9 @@ class EagerContext : public core::RefCounted {
std::shared_ptr<WorkerSession> worker_session_;
std::unique_ptr<eager::EagerClientCache> remote_eager_workers_;
// Starts a thread for profiling service.
std::unique_ptr<ProfilerServer> profiler_server_;
mutex remote_state_mu_;
uint64 context_id_ GUARDED_BY(remote_state_mu_);

View File

@ -30,6 +30,7 @@ cc_library(
"//tensorflow:grpc++",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//tensorflow/core/profiler:profiler_service_proto_cc",
"@com_google_absl//absl/strings",
],

View File

@ -23,13 +23,14 @@ limitations under the License.
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/profiler/profiler_service.grpc.pb.h"
#include "tensorflow/core/profiler/rpc/profiler_service_impl.h"
#include "tensorflow/core/util/env_var.h"
#include "tensorflow/core/util/ptr_util.h"
namespace tensorflow {
std::unique_ptr<Thread> StartProfilerServer(int32 port) {
void ProfilerServer::StartProfilerServer(int32 port) {
Env* env = Env::Default();
return WrapUnique(env->StartThread({}, "profiler server", [port]() {
auto start_server = [port, this]() {
string server_address = absl::StrCat("0.0.0.0:", port);
std::unique_ptr<grpc::ProfilerService::Service> service =
CreateProfilerService();
@ -37,10 +38,36 @@ std::unique_ptr<Thread> StartProfilerServer(int32 port) {
builder.AddListeningPort(server_address,
::grpc::InsecureServerCredentials());
builder.RegisterService(service.get());
std::unique_ptr<::grpc::Server> server(builder.BuildAndStart());
server_ = builder.BuildAndStart();
LOG(INFO) << "Profiling Server listening on " << server_address;
server->Wait();
}));
server_->Wait();
};
server_thread_ =
WrapUnique(env->StartThread({}, "ProfilerServer", start_server));
}
// Starts the profiler server iff the TF_PROFILER_PORT environment variable is
// set to a port in the registered-port range [1024, 49151]; otherwise logs a
// warning (silently no-ops for the unset/-1 default) and returns.
void ProfilerServer::MaybeStartProfilerServer() {
  int64 profiler_port;
  // ReadInt64FromEnvVar guarantees that the output argument is set to the
  // default value (-1) on failure, so profiler_port is always initialized.
  Status s = ReadInt64FromEnvVar("TF_PROFILER_PORT", -1, &profiler_port);
  if (!s.ok()) {
    LOG(WARNING) << "StartProfilerServer: " << s.error_message();
  }
  if (profiler_port < 1024 || profiler_port > 49151) {
    // Suppress the log message when profiler_port is -1 (the unset default)
    // to avoid spamming the terminal for TF users who never set a port.
    if (profiler_port == -1) return;
    LOG(WARNING)
        << "Profiler server not started. TF_PROFILER_PORT: " << profiler_port
        << " is out of the valid registered port range (1024 to 49151).";
    return;
  }
  StartProfilerServer(profiler_port);
}
// Shuts down the gRPC server (if one was ever started) on destruction.
ProfilerServer::~ProfilerServer() {
  if (server_ != nullptr) {
    server_->Shutdown();
  }
}
} // namespace tensorflow

View File

@ -17,13 +17,26 @@ limitations under the License.
#include <memory>
#include "grpcpp/grpcpp.h"
#include "tensorflow/core/platform/types.h"
namespace tensorflow {
class Thread;
std::unique_ptr<Thread> StartProfilerServer(int32 port);
// Owns a gRPC profiler service: the server object plus the thread that runs
// its blocking Wait() loop. Shutdown() is issued from the destructor.
class ProfilerServer {
 public:
  ~ProfilerServer();
  // If TF_PROFILER_PORT is defined, starts a profiler server with the
  // specified port. Otherwise, don't start a profiler server.
  void MaybeStartProfilerServer();
  // Starts a profiler server with a given port.
  void StartProfilerServer(int32 port);

 private:
  // gRPC server instance; null until StartProfilerServer has run.
  std::unique_ptr<::grpc::Server> server_;
  // Thread executing the server's blocking event loop.
  std::unique_ptr<Thread> server_thread_;
};
} // namespace tensorflow