Add a GPUOptions option to force all tensors to be gpu_compatible
Change: 153386455
parent af36579b63
commit 94a2da3905
Files changed: tensorflow/core, tools/api/golden
@@ -31,12 +31,17 @@ class GPUDevice : public BaseGPUDevice {
                   Allocator* cpu_allocator)
       : BaseGPUDevice(options, name, memory_limit, locality, gpu_id,
                       physical_device_desc, gpu_allocator, cpu_allocator,
-                      false /* sync every op */, 1 /* max_streams */) {}
+                      false /* sync every op */, 1 /* max_streams */) {
+    if (options.config.has_gpu_options()) {
+      force_gpu_compatible_ =
+          options.config.gpu_options().force_gpu_compatible();
+    }
+  }
 
   Allocator* GetAllocator(AllocatorAttributes attr) override {
     if (attr.on_host()) {
-      ProcessState* ps = ProcessState::singleton();
-      if (attr.gpu_compatible()) {
+      if (attr.gpu_compatible() || force_gpu_compatible_) {
+        ProcessState* ps = ProcessState::singleton();
         return ps->GetCUDAHostAllocator(0);
       } else {
         return cpu_allocator_;
@@ -45,6 +50,9 @@ class GPUDevice : public BaseGPUDevice {
       return gpu_allocator_;
     }
   }
+
+ private:
+  bool force_gpu_compatible_ = false;
 };
 
 class GPUDeviceFactory : public BaseGPUDeviceFactory {
@@ -71,18 +79,26 @@ class GPUCompatibleCPUDevice : public ThreadPoolDevice {
   GPUCompatibleCPUDevice(const SessionOptions& options, const string& name,
                          Bytes memory_limit, const DeviceLocality& locality,
                          Allocator* allocator)
-      : ThreadPoolDevice(options, name, memory_limit, locality, allocator) {}
+      : ThreadPoolDevice(options, name, memory_limit, locality, allocator) {
+    if (options.config.has_gpu_options()) {
+      force_gpu_compatible_ =
+          options.config.gpu_options().force_gpu_compatible();
+    }
+  }
   ~GPUCompatibleCPUDevice() override {}
 
   Allocator* GetAllocator(AllocatorAttributes attr) override {
     ProcessState* ps = ProcessState::singleton();
-    if (attr.gpu_compatible()) {
+    if (attr.gpu_compatible() || force_gpu_compatible_) {
       return ps->GetCUDAHostAllocator(0);
     } else {
       // Call the parent's implementation.
       return ThreadPoolDevice::GetAllocator(attr);
     }
   }
+
+ private:
+  bool force_gpu_compatible_ = false;
 };
 
 // The associated factory.
@@ -64,6 +64,18 @@ message GPUOptions {
   // PollEvents calls, when the queue is empty. If value is not
   // set or set to 0, gets set to a non-zero default.
   int32 polling_inactive_delay_msecs = 7;
+
+  // Force all tensors to be gpu_compatible. On a GPU-enabled TensorFlow,
+  // enabling this option forces all CPU tensors to be allocated with Cuda
+  // pinned memory. Normally, TensorFlow will infer which tensors should be
+  // allocated as pinned memory. But in cases where the inference is
+  // incomplete, this option can significantly speed up cross-device memory
+  // copies as long as the tensors fit in memory.
+  // Note that this option should not be enabled by default for unknown or
+  // very large models, since all Cuda pinned memory is unpageable; having
+  // too much pinned memory can negatively impact overall host system
+  // performance.
+  bool force_gpu_compatible = 8;
 };
 
 // Options passed to the graph optimizer
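A minimal usage sketch (not part of the diff): enabling the new field from the Python API, assuming a GPU-enabled build of TensorFlow. The tiny graph is only a placeholder.

import tensorflow as tf

config = tf.ConfigProto()
# The field added by this change: force CPU tensors into Cuda pinned
# (page-locked) host memory so device transfers can use DMA.
config.gpu_options.force_gpu_compatible = True

with tf.Session(config=config) as sess:
    x = tf.constant([1.0, 2.0, 3.0])
    print(sess.run(tf.reduce_sum(x)))  # runs with pinned host allocations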
@@ -22,6 +22,10 @@ tf_class {
     name: "Extensions"
     mtype: "<type \'getset_descriptor\'>"
   }
+  member {
+    name: "FORCE_GPU_COMPATIBLE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
   member {
     name: "PER_PROCESS_GPU_MEMORY_FRACTION_FIELD_NUMBER"
     mtype: "<type \'int\'>"
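The hunk above updates the API golden file with the FORCE_GPU_COMPATIBLE_FIELD_NUMBER constant that the protobuf compiler generates on the Python GPUOptions class. A quick sketch (not part of the diff) of reading that constant; the value 8 comes from the field id assigned in config.proto above.

from tensorflow.core.protobuf import config_pb2

# Generated protobuf messages expose a *_FIELD_NUMBER constant per field.
assert config_pb2.GPUOptions.FORCE_GPU_COMPATIBLE_FIELD_NUMBER == 8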