Fix BaseGPUDevice so that it reports the actual memory limit of the allocator. Also add a helper method to reset ProcessState.

PiperOrigin-RevId: 186655996
This commit is contained in:
Guangda Lai 2018-02-22 12:24:22 -08:00 committed by TensorFlower Gardener
parent 51b3348751
commit 2e707494c4

View File

@ -1013,21 +1013,34 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options,
GpuIdUtil::CheckValidTfGpuId(tf_gpu_id);
CudaGpuId cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id);
int numa_node = dev_locality.numa_node();
Bytes allocated_bytes = static_cast<Bytes>(memory_limit);
gpu::StreamExecutor* se =
GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
const gpu::DeviceDescription& desc = se->GetDeviceDescription();
LOG(INFO) << "Creating TensorFlow device (" << device_name << " with "
<< (memory_limit >> 20) << " MB memory) -> physical GPU ("
<< GetShortDeviceDescription(cuda_gpu_id, desc) << ")";
ProcessState* process_state = ProcessState::singleton();
Allocator* gpu_allocator = process_state->GetGPUAllocator(
options.config.gpu_options(), tf_gpu_id, memory_limit);
if (gpu_allocator == nullptr) {
return errors::Internal("Failed to get memory allocator for TF GPU ",
tf_gpu_id.value(), " with ", memory_limit,
" bytes of memory.");
}
AllocatorStats stats;
gpu_allocator->GetStats(&stats);
// 'memory_limit' is the required memory size, but if the allocator with given
// tf_gpu_id was created before, we'll use it instead of creating a new one
// (as TF gpu device is a shared resource), in which case the actual memory
// limit represented by 'stats.bytes_limit' used by that allocator may be
// different (which should be an error).
//
// TODO(laigd): report error if memory_limit doesn't match stats.bytes_limit.
BaseGPUDevice* gpu_device = CreateGPUDevice(
options, device_name, allocated_bytes, dev_locality, tf_gpu_id,
GetShortDeviceDescription(cuda_gpu_id, desc),
process_state->GetGPUAllocator(options.config.gpu_options(), tf_gpu_id,
memory_limit),
options, device_name, static_cast<Bytes>(stats.bytes_limit), dev_locality,
tf_gpu_id, GetShortDeviceDescription(cuda_gpu_id, desc), gpu_allocator,
process_state->GetCPUAllocator(numa_node));
LOG(INFO) << "Created TensorFlow device (" << device_name << " with "
<< (stats.bytes_limit >> 20) << " MB memory) -> physical GPU ("
<< GetShortDeviceDescription(cuda_gpu_id, desc) << ")";
TF_RETURN_IF_ERROR(gpu_device->Init(options));
devices->push_back(gpu_device);