Use a singleton threadpool in SingleThreadedCpuDevice instead of creating one for each graph.

We currently create a new ThreadPool with a single thread each time we create a new TF_Graph, perform constant folding, perform shape inference in C++ via a ShapeRefiner, import a GraphDef, or restore an Iterator from a checkpoint.

This pool exists only in case we run an Eigen kernel that uses intra-op parallelism, but since we never attempt to parallelize when the pool has only one thread, it sits idle for its entire short lifetime.

This change turns the ThreadPool used in SingleThreadedCpuDevice into a global singleton. The cost is that we keep an idle thread around for the lifetime of the process, but we save on thread creation and destruction. Since we previously created a SingleThreadedCpuDevice (and hence a thread) at least once per graph function, this seems like a reasonable tradeoff.
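
For illustration only (this is a standalone sketch, not TensorFlow code): the pattern is a lazily created, intentionally leaked, process-wide single-thread pool that all callers share. SingleThreadPool and SharedPool below are made-up stand-ins for tensorflow::thread::ThreadPool and the new GraphRunnerThreadPool() accessor shown in the diff.

// Standalone illustration: one single-thread pool for the whole process,
// created on first use and intentionally never destroyed.
#include <chrono>
#include <condition_variable>
#include <functional>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>

// Tiny single-thread work queue standing in for tensorflow::thread::ThreadPool.
class SingleThreadPool {
 public:
  SingleThreadPool() : worker_([this] { Loop(); }) {}

  void Schedule(std::function<void()> fn) {
    {
      std::lock_guard<std::mutex> l(mu_);
      work_.push(std::move(fn));
    }
    cv_.notify_one();
  }

 private:
  void Loop() {
    for (;;) {
      std::function<void()> fn;
      {
        std::unique_lock<std::mutex> l(mu_);
        cv_.wait(l, [this] { return !work_.empty(); });
        fn = std::move(work_.front());
        work_.pop();
      }
      fn();
    }
  }

  std::mutex mu_;
  std::condition_variable cv_;
  std::queue<std::function<void()>> work_;
  std::thread worker_;  // Declared last so the queue exists before the thread starts.
};

// The "singleton" accessor: the pool is created on first call and leaked on
// purpose, so its worker thread lives for the lifetime of the process.
SingleThreadPool* SharedPool() {
  static SingleThreadPool* pool = new SingleThreadPool;
  return pool;
}

int main() {
  // Both of these short-lived "graph" tasks reuse the same worker thread
  // instead of creating and destroying a thread of their own.
  SharedPool()->Schedule([] { std::cout << "constant folding\n"; });
  SharedPool()->Schedule([] { std::cout << "shape inference\n"; });
  // Crude way to let the demo drain its queue before the process exits.
  std::this_thread::sleep_for(std::chrono::milliseconds(100));
  return 0;
}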

PiperOrigin-RevId: 236739404
Derek Murray 2019-03-04 15:47:22 -08:00 committed by TensorFlower Gardener
parent c5c2f03b57
commit 3ada13e261
4 changed files with 101 additions and 58 deletions

tensorflow/core/BUILD

@@ -3098,6 +3098,7 @@ tf_cuda_library(
         "common_runtime/session_factory.cc",
         "common_runtime/session_options.cc",
         "common_runtime/session_state.cc",
+        "common_runtime/single_threaded_cpu_device.cc",
         "common_runtime/stats_publisher_interface.cc",
         "common_runtime/step_stats_collector.cc",
         "common_runtime/threadpool_device.cc",

tensorflow/core/common_runtime/graph_runner.cc

@@ -92,7 +92,7 @@ class SimpleRendezvous : public Rendezvous {
 }  // namespace

 GraphRunner::GraphRunner(Env* env)
-    : device_deleter_(new SingleThreadedCpuDevice(env)),
+    : device_deleter_(NewSingleThreadedCpuDevice(env)),
       device_(device_deleter_.get()) {}

 GraphRunner::GraphRunner(Device* device) : device_(device) {}

tensorflow/core/common_runtime/single_threaded_cpu_device.cc (new file)

@@ -0,0 +1,92 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/common_runtime/single_threaded_cpu_device.h"
#define EIGEN_USE_THREADS
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/common_runtime/eigen_thread_pool.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/tensor.pb.h"
#include "tensorflow/core/lib/core/threadpool.h"
namespace tensorflow {
namespace {

static constexpr int kNumThreads = 1;

thread::ThreadPool* GraphRunnerThreadPool() {
  static thread::ThreadPool* thread_pool =
      new thread::ThreadPool(Env::Default(), "graph_runner", kNumThreads);
  return thread_pool;
}

// A simple single-threaded CPU device. This can be used to run inexpensive
// computations. In particular, using this avoids initializing the global thread
// pools in LocalDevice.
class SingleThreadedCpuDevice : public Device {
 public:
  explicit SingleThreadedCpuDevice(Env* env)
      : Device(env, Device::BuildDeviceAttributes("/device:CPU:0", DEVICE_CPU,
                                                  Bytes(256 << 20),
                                                  DeviceLocality())) {
    eigen_worker_threads_.num_threads = kNumThreads;
    eigen_worker_threads_.workers = GraphRunnerThreadPool();
    eigen_threadpool_wrapper_.reset(
        new EigenThreadPoolWrapper(eigen_worker_threads_.workers));
    eigen_device_.reset(new Eigen::ThreadPoolDevice(
        eigen_threadpool_wrapper_.get(), eigen_worker_threads_.num_threads));
    set_tensorflow_cpu_worker_threads(&eigen_worker_threads_);
    set_eigen_cpu_device(eigen_device_.get());
  }

  ~SingleThreadedCpuDevice() override {
    eigen_threadpool_wrapper_.reset();
    eigen_device_.reset();
  }

  Status Sync() override { return Status::OK(); }

  Status MakeTensorFromProto(const TensorProto& tensor_proto,
                             const AllocatorAttributes alloc_attrs,
                             Tensor* tensor) override {
    Tensor parsed(tensor_proto.dtype());
    if (!parsed.FromProto(cpu_allocator(), tensor_proto)) {
      return errors::InvalidArgument("Cannot parse tensor from tensor_proto.");
    }
    *tensor = parsed;
    return Status::OK();
  }

  Allocator* GetAllocator(AllocatorAttributes attr) override {
    return cpu_allocator();
  }

 private:
  DeviceBase::CpuWorkerThreads eigen_worker_threads_;
  std::unique_ptr<Eigen::ThreadPoolInterface> eigen_threadpool_wrapper_;
  std::unique_ptr<Eigen::ThreadPoolDevice> eigen_device_;
};

}  // namespace

Device* NewSingleThreadedCpuDevice(Env* env) {
  return new SingleThreadedCpuDevice(env);
}

}  // namespace tensorflow

tensorflow/core/common_runtime/single_threaded_cpu_device.h

@@ -16,67 +16,17 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SINGLE_THREADED_CPU_DEVICE_H_
 #define TENSORFLOW_CORE_COMMON_RUNTIME_SINGLE_THREADED_CPU_DEVICE_H_

-#define EIGEN_USE_THREADS
-
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/common_runtime/device.h"
-#include "tensorflow/core/common_runtime/eigen_thread_pool.h"
-#include "tensorflow/core/framework/allocator.h"
-#include "tensorflow/core/framework/tensor.pb.h"
-#include "tensorflow/core/lib/core/threadpool.h"
-
 namespace tensorflow {

+class Device;
+class Env;
+
-// A simple single-threaded CPU device. This can be used to run inexpensive
-// computations. In particular, using this avoids initializing the global thread
-// pools in LocalDevice.
-class SingleThreadedCpuDevice : public Device {
- public:
-  SingleThreadedCpuDevice(Env* env)
-      : Device(env, Device::BuildDeviceAttributes("/device:CPU:0", DEVICE_CPU,
-                                                  Bytes(256 << 20),
-                                                  DeviceLocality())) {
-    eigen_worker_threads_.num_threads = 1;
-    eigen_worker_threads_.workers = new thread::ThreadPool(
-        env, "graph_runner", eigen_worker_threads_.num_threads);
-    eigen_threadpool_wrapper_.reset(
-        new EigenThreadPoolWrapper(eigen_worker_threads_.workers));
-    eigen_device_.reset(new Eigen::ThreadPoolDevice(
-        eigen_threadpool_wrapper_.get(), eigen_worker_threads_.num_threads));
-    set_tensorflow_cpu_worker_threads(&eigen_worker_threads_);
-    set_eigen_cpu_device(eigen_device_.get());
-  }
-
-  ~SingleThreadedCpuDevice() override {
-    eigen_threadpool_wrapper_.reset();
-    eigen_device_.reset();
-    delete eigen_worker_threads_.workers;
-  }
-
-  Status Sync() override { return Status::OK(); }
-
-  Status MakeTensorFromProto(const TensorProto& tensor_proto,
-                             const AllocatorAttributes alloc_attrs,
-                             Tensor* tensor) override {
-    Tensor parsed(tensor_proto.dtype());
-    if (!parsed.FromProto(cpu_allocator(), tensor_proto)) {
-      return errors::InvalidArgument("Cannot parse tensor from tensor_proto.");
-    }
-    *tensor = parsed;
-    return Status::OK();
-  }
-
-  Allocator* GetAllocator(AllocatorAttributes attr) override {
-    return cpu_allocator();
-  }
-
- private:
-  DeviceBase::CpuWorkerThreads eigen_worker_threads_;
-  std::unique_ptr<Eigen::ThreadPoolInterface> eigen_threadpool_wrapper_;
-  std::unique_ptr<Eigen::ThreadPoolDevice> eigen_device_;
-};
+
+// Returns a simple single-threaded CPU device. This can be used to run
+// inexpensive computations. In particular, using this avoids initializing the
+// global thread pools in LocalDevice.
+//
+// The returned pointer is owned by the caller.
+Device* NewSingleThreadedCpuDevice(Env* env);
+
 }  // namespace tensorflow
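
A hedged caller-side sketch, not part of this commit: because NewSingleThreadedCpuDevice() returns a raw pointer owned by the caller, call sites typically wrap it in a std::unique_ptr, just as GraphRunner's device_deleter_ member does above. The helper RunCheapComputation below is hypothetical.

// Hypothetical call site, assuming the in-tree headers shown in this commit.
#include <memory>

#include "tensorflow/core/common_runtime/device.h"  // Complete Device type for unique_ptr.
#include "tensorflow/core/common_runtime/single_threaded_cpu_device.h"
#include "tensorflow/core/platform/env.h"

namespace tensorflow {

void RunCheapComputation() {
  // The Device object itself stays cheap and per-use; after this change only
  // the backing thread pool is a process-wide singleton.
  std::unique_ptr<Device> device(NewSingleThreadedCpuDevice(Env::Default()));
  // ... pass device.get() to a GraphRunner or another lightweight executor ...
}

}  // namespace tensorflow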