Fix two memory leaks and enable asan for C API remote tests.

PiperOrigin-RevId: 321801325 Change-Id: Id579f93e167c9665b4ca740eee160da801ca0694
2020-07-17 10:10:53 -07:00 · 2020-07-17 10:10:53 -07:00 · 5c4b8790ca
commit 5c4b8790ca
parent 86ba317d72
7 changed files with 24 additions and 16 deletions
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@ -525,12 +525,12 @@ tensorflow::Status EnableCollectiveOps(const tensorflow::ServerDef& server_def,

    LOG_AND_RETURN_IF_ERROR(context->StoreCollectiveOpsServer(
        std::move(new_server), grpc_server->worker_env()->device_mgr,
-        grpc_server->worker_env()->collective_executor_mgr));
+        grpc_server->worker_env()->collective_executor_mgr.get()));
  } else {
    LOG_AND_RETURN_IF_ERROR(grpc_server->UpdateServerDef(server_def));
    LOG_AND_RETURN_IF_ERROR(context->StoreCollectiveOpsServer(
        /*new_server=*/nullptr, grpc_server->worker_env()->device_mgr,
-        grpc_server->worker_env()->collective_executor_mgr));
+        grpc_server->worker_env()->collective_executor_mgr.get()));
  }
  return tensorflow::Status::OK();
 #undef LOG_AND_RETURN_IF_ERROR
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@ -514,7 +514,6 @@ tf_cuda_cc_test(
    extra_copts = tfe_xla_copts(),
    tags = [
        "no_windows",
-        "noasan",  # leaks gRPC server instances
    ],
    deps = [
        ":c_api",
@ -581,7 +580,6 @@ tf_cuda_cc_test(
    extra_copts = tfe_xla_copts(),
    tags = [
        "no_windows",
-        "noasan",  # leaks gRPC server instances
    ],
    deps = [
        ":c_api",
--- a/tensorflow/core/distributed_runtime/master_env.h
+++ b/tensorflow/core/distributed_runtime/master_env.h
@ -62,7 +62,7 @@ struct WorkerCacheFactoryOptions {
 struct MasterEnv {
  Env* env = nullptr;

-  // Object from which WorkerInterface instances can be obtained.
+  // Object from which WorkerInterface instances can be obtained. Not owned.
  WorkerCacheInterface* worker_cache = nullptr;

  // The operation definitions to use.  Must be filled before use.
@ -93,7 +93,7 @@ struct MasterEnv {
      worker_cache_factory;

  // Generates per-step CollectiveExecutors and has access to utilities
-  // supporting collective operations.
+  // supporting collective operations. Not owned.
  CollectiveExecutorMgrInterface* collective_executor_mgr = nullptr;
 };

--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@ -267,9 +267,9 @@ Status GrpcServer::Init(const GrpcServerOptions& opts) {
  CHECK_NE(nullptr, worker_cache);

  if (opts.collective_mgr_func) {
-    worker_env_.collective_executor_mgr =
-        opts.collective_mgr_func(config, &worker_env_, worker_cache);
-    if (!worker_env_.collective_executor_mgr) {
+    worker_env_.collective_executor_mgr.reset(
+        opts.collective_mgr_func(config, &worker_env_, worker_cache));
+    if (worker_env_.collective_executor_mgr == nullptr) {
      return errors::Internal(
          "collective_mgr_func did not return CollectiveExecutorMgr");
    }
@ -281,9 +281,9 @@ Status GrpcServer::Init(const GrpcServerOptions& opts) {
        new CollectiveParamResolverDistributed(config, worker_env_.device_mgr,
                                               dev_resolver.get(), worker_cache,
                                               default_worker_name));
-    worker_env_.collective_executor_mgr = new RpcCollectiveExecutorMgr(
+    worker_env_.collective_executor_mgr.reset(new RpcCollectiveExecutorMgr(
        config, worker_env_.device_mgr, std::move(dev_resolver),
-        std::move(param_resolver), worker_cache, default_worker_name);
+        std::move(param_resolver), worker_cache, default_worker_name));
  }

  // Set up worker environment.
@ -299,7 +299,8 @@ Status GrpcServer::Init(const GrpcServerOptions& opts) {
  // Finish setting up master environment.
  master_env_.ops = OpRegistry::Global();
  master_env_.worker_cache = worker_cache;
-  master_env_.collective_executor_mgr = worker_env_.collective_executor_mgr;
+  master_env_.collective_executor_mgr =
+      worker_env_.collective_executor_mgr.get();
  StatsPublisherFactory stats_factory = opts.stats_factory;
  master_env_.master_session_factory =
      [config, stats_factory](
@ -433,6 +434,8 @@ Status GrpcServer::UpdateServerDef(const ServerDef& server_def) {
    return errors::InvalidArgument(
        "Failed to build worker cache with the provided server def.");
  }
+  // Transfer ownership of worker_cache to worker_env_.session_mgr.
+  worker_env_.session_mgr->ResetDefaultWorkerCache(worker_cache);

  string default_worker_name;
  string unused;
@ -447,13 +450,14 @@ Status GrpcServer::UpdateServerDef(const ServerDef& server_def) {
      new CollectiveParamResolverDistributed(
          server_def_.default_session_config(), worker_env_.device_mgr,
          dev_resolver.get(), worker_cache, default_worker_name));
-  worker_env_.collective_executor_mgr = new RpcCollectiveExecutorMgr(
+  worker_env_.collective_executor_mgr.reset(new RpcCollectiveExecutorMgr(
      server_def_.default_session_config(), worker_env_.device_mgr,
      std::move(dev_resolver), std::move(param_resolver), worker_cache,
-      default_worker_name);
+      default_worker_name));

  master_env_.worker_cache = worker_cache;
-  master_env_.collective_executor_mgr = worker_env_.collective_executor_mgr;
+  master_env_.collective_executor_mgr =
+      worker_env_.collective_executor_mgr.get();
  return Status::OK();
 }

--- a/tensorflow/core/distributed_runtime/session_mgr.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr.cc
@ -144,6 +144,10 @@ Status SessionMgr::CreateSession(
  return Status::OK();
 }

+void SessionMgr::ResetDefaultWorkerCache(WorkerCacheInterface* worker_cache) {
+  default_worker_cache_.reset(worker_cache);
+}
+
 Status SessionMgr::UpdateSession(
    const string& session, const ServerDef& server_def,
    const protobuf::RepeatedPtrField<DeviceAttributes>&
--- a/tensorflow/core/distributed_runtime/session_mgr.h
+++ b/tensorflow/core/distributed_runtime/session_mgr.h
@ -53,6 +53,8 @@ class SessionMgr {
      const protobuf::RepeatedPtrField<DeviceAttributes>& device_attributes,
      bool isolate_session_state);

+  void ResetDefaultWorkerCache(WorkerCacheInterface* worker_cache);
+
  // Updates state (worker cache, devices) of worker session identified by
  // session name (`session`) based on a new server_def and set of devices.
  Status UpdateSession(const string& session, const ServerDef& server_def,
--- a/tensorflow/core/distributed_runtime/worker_env.h
+++ b/tensorflow/core/distributed_runtime/worker_env.h
@ -60,7 +60,7 @@ struct WorkerEnv {

  // Generates per-step CollectiveExecutors and has access to utilities
  // supporting collective operations.
-  CollectiveExecutorMgrInterface* collective_executor_mgr = nullptr;
+  std::unique_ptr<CollectiveExecutorMgrInterface> collective_executor_mgr;

  // A pool of threads for scheduling compute work.
  thread::ThreadPool* compute_pool = nullptr;