Fix BaseGPUDevice so that it reports the actual memory limit of the allocator. Also add a helper method to reset ProcessState.

PiperOrigin-RevId: 186655996
This commit is contained in:
Guangda Lai 2018-02-22 12:24:22 -08:00 committed by TensorFlower Gardener
parent 51b3348751
commit 2e707494c4

View File

@ -1013,21 +1013,34 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options,
GpuIdUtil::CheckValidTfGpuId(tf_gpu_id);
CudaGpuId cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id);
int numa_node = dev_locality.numa_node();
Bytes allocated_bytes = static_cast<Bytes>(memory_limit);
gpu::StreamExecutor* se =
GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
const gpu::DeviceDescription& desc = se->GetDeviceDescription();
LOG(INFO) << "Creating TensorFlow device (" << device_name << " with "
<< (memory_limit >> 20) << " MB memory) -> physical GPU ("
<< GetShortDeviceDescription(cuda_gpu_id, desc) << ")";
ProcessState* process_state = ProcessState::singleton();
Allocator* gpu_allocator = process_state->GetGPUAllocator(
options.config.gpu_options(), tf_gpu_id, memory_limit);
if (gpu_allocator == nullptr) {
return errors::Internal("Failed to get memory allocator for TF GPU ",
tf_gpu_id.value(), " with ", memory_limit,
" bytes of memory.");
}
AllocatorStats stats;
gpu_allocator->GetStats(&stats);
// 'memory_limit' is the required memory size, but if the allocator with given
// tf_gpu_id was created before, we'll use it instead of creating a new one
// (as TF gpu device is a shared resource), in which case the actual memory
// limit represented by 'stats.bytes_limit' used by that allocator may be
// different (which should be an error).
//
// TODO(laigd): report error if memory_limit doesn't match stats.bytes_limit.
BaseGPUDevice* gpu_device = CreateGPUDevice(
options, device_name, allocated_bytes, dev_locality, tf_gpu_id,
GetShortDeviceDescription(cuda_gpu_id, desc),
process_state->GetGPUAllocator(options.config.gpu_options(), tf_gpu_id,
memory_limit),
options, device_name, static_cast<Bytes>(stats.bytes_limit), dev_locality,
tf_gpu_id, GetShortDeviceDescription(cuda_gpu_id, desc), gpu_allocator,
process_state->GetCPUAllocator(numa_node));
LOG(INFO) << "Created TensorFlow device (" << device_name << " with "
<< (stats.bytes_limit >> 20) << " MB memory) -> physical GPU ("
<< GetShortDeviceDescription(cuda_gpu_id, desc) << ")";
TF_RETURN_IF_ERROR(gpu_device->Init(options));
devices->push_back(gpu_device);