From a4d22c8d01610b4d6c5c5d1017755c04f86429c1 Mon Sep 17 00:00:00 2001 From: "tongxuan.ltx" Date: Mon, 20 Apr 2020 19:09:47 +0800 Subject: [PATCH 1/4] Support an option (environment variable TF_GRPC_REUSE_PORT) to enable gRPC port reuse. ReusePort scenario: a parent process occupies the port and shares it through a service such as ZooKeeper; a child process (the TensorFlow process) then reuses the port. --- .../rpc/grpc_server_lib.cc | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc index 32083fc272f..aba9fe03d40 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc @@ -70,6 +70,18 @@ class NoReusePortOption : public ::grpc::ServerBuilderOption { plugins) override {} }; +// Define an option subclass in order to disable SO_REUSEPORT for the +// server socket. +class ReusePortOption : public ::grpc::ServerBuilderOption { + public: + void UpdateArguments(::grpc::ChannelArguments* args) override { + args->SetInt(GRPC_ARG_ALLOW_REUSEPORT, 1); + } + + void UpdatePlugins(std::vector<std::unique_ptr<::grpc::ServerBuilderPlugin>>* + plugins) override {} +}; + // static utility function RendezvousMgrInterface* NewRpcRendezvousMgr(const WorkerEnv* env) { return new RpcRendezvousMgr(env); @@ -220,8 +232,14 @@ Status GrpcServer::Init(const GrpcServerOptions& opts) { GetServerCredentials(server_def_), &bound_port_); builder.SetMaxMessageSize(std::numeric_limits<int32>::max()); - builder.SetOption( - std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption)); + bool reuse_port = false; + ReadBoolFromEnvVar("TF_GRPC_REUSE_PORT", false, &reuse_port) + .IgnoreError(); + auto server_build_option = reuse_port ? 
+ std::unique_ptr<::grpc::ServerBuilderOption>(new ReusePortOption) : + std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption); + builder.SetOption(server_build_option); + // Allow subclasses to specify more args to pass to the gRPC server. MaybeMutateBuilder(&builder); master_impl_ = CreateMaster(&master_env_); From bbe13474e71eb2694be5050e22759d3bc5307026 Mon Sep 17 00:00:00 2001 From: "tongxuan.ltx" Date: Mon, 20 Apr 2020 23:12:36 +0800 Subject: [PATCH 2/4] fix typo --- tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc index aba9fe03d40..6555ded82da 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc @@ -70,7 +70,7 @@ class NoReusePortOption : public ::grpc::ServerBuilderOption { plugins) override {} }; -// Define an option subclass in order to disable SO_REUSEPORT for the +// Define an option subclass in order to enable SO_REUSEPORT for the // server socket. class ReusePortOption : public ::grpc::ServerBuilderOption { public: From 231edfa4184dfd9cd2dce7a24d089d079ffffc6d Mon Sep 17 00:00:00 2001 From: "tongxuan.ltx" Date: Thu, 23 Apr 2020 10:13:45 +0800 Subject: [PATCH 3/4] fix build break --- tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc index 6555ded82da..b2efed619a4 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc @@ -238,7 +238,7 @@ Status GrpcServer::Init(const GrpcServerOptions& opts) { auto server_build_option = reuse_port ? 
std::unique_ptr<::grpc::ServerBuilderOption>(new ReusePortOption) : std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption); - builder.SetOption(server_build_option); + builder.SetOption(std::move(server_build_option)); // Allow subclasses to specify more args to pass to the gRPC server. MaybeMutateBuilder(&builder); From fe3c1035c22eebad69d5fbe85e987f66d5a40a2c Mon Sep 17 00:00:00 2001 From: "tongxuan.ltx" Date: Sat, 2 May 2020 08:11:06 +0000 Subject: [PATCH 4/4] Check the status returned when reading the TF_GRPC_REUSE_PORT environment variable --- tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc index b2efed619a4..ccaf0af213b 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc @@ -233,8 +233,11 @@ Status GrpcServer::Init(const GrpcServerOptions& opts) { builder.SetMaxMessageSize(std::numeric_limits<int32>::max()); bool reuse_port = false; - ReadBoolFromEnvVar("TF_GRPC_REUSE_PORT", false, &reuse_port) - .IgnoreError(); + const Status status = ReadBoolFromEnvVar("TF_GRPC_REUSE_PORT", false, + &reuse_port); + if (!status.ok()) { + LOG(ERROR) << status.error_message(); + } auto server_build_option = reuse_port ? std::unique_ptr<::grpc::ServerBuilderOption>(new ReusePortOption) : std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption);