Add a GPUOptions option to force all tensors to be gpu_compatible
Change: 153386455
parent af36579b63
commit 94a2da3905
Files changed: tensorflow/core, tools/api/golden
@@ -31,12 +31,17 @@ class GPUDevice : public BaseGPUDevice {
                   Allocator* cpu_allocator)
       : BaseGPUDevice(options, name, memory_limit, locality, gpu_id,
                       physical_device_desc, gpu_allocator, cpu_allocator,
-                      false /* sync every op */, 1 /* max_streams */) {}
+                      false /* sync every op */, 1 /* max_streams */) {
+    if (options.config.has_gpu_options()) {
+      force_gpu_compatible_ =
+          options.config.gpu_options().force_gpu_compatible();
+    }
+  }
 
   Allocator* GetAllocator(AllocatorAttributes attr) override {
     if (attr.on_host()) {
-      ProcessState* ps = ProcessState::singleton();
-      if (attr.gpu_compatible()) {
+      if (attr.gpu_compatible() || force_gpu_compatible_) {
+        ProcessState* ps = ProcessState::singleton();
         return ps->GetCUDAHostAllocator(0);
       } else {
         return cpu_allocator_;
@@ -45,6 +50,9 @@ class GPUDevice : public BaseGPUDevice {
       return gpu_allocator_;
     }
   }
+
+ private:
+  bool force_gpu_compatible_ = false;
 };
 
 class GPUDeviceFactory : public BaseGPUDeviceFactory {
@@ -71,18 +79,26 @@ class GPUCompatibleCPUDevice : public ThreadPoolDevice {
   GPUCompatibleCPUDevice(const SessionOptions& options, const string& name,
                          Bytes memory_limit, const DeviceLocality& locality,
                          Allocator* allocator)
-      : ThreadPoolDevice(options, name, memory_limit, locality, allocator) {}
+      : ThreadPoolDevice(options, name, memory_limit, locality, allocator) {
+    if (options.config.has_gpu_options()) {
+      force_gpu_compatible_ =
+          options.config.gpu_options().force_gpu_compatible();
+    }
+  }
   ~GPUCompatibleCPUDevice() override {}
 
   Allocator* GetAllocator(AllocatorAttributes attr) override {
     ProcessState* ps = ProcessState::singleton();
-    if (attr.gpu_compatible()) {
+    if (attr.gpu_compatible() || force_gpu_compatible_) {
       return ps->GetCUDAHostAllocator(0);
     } else {
       // Call the parent's implementation.
       return ThreadPoolDevice::GetAllocator(attr);
     }
   }
+
+ private:
+  bool force_gpu_compatible_ = false;
 };
 
 // The associated factory.
@@ -64,6 +64,18 @@ message GPUOptions {
   // PollEvents calls, when the queue is empty. If value is not
   // set or set to 0, gets set to a non-zero default.
   int32 polling_inactive_delay_msecs = 7;
+
+  // Force all tensors to be gpu_compatible. On a GPU-enabled TensorFlow,
+  // enabling this option forces all CPU tensors to be allocated with Cuda
+  // pinned memory. Normally, TensorFlow will infer which tensors should be
+  // allocated as pinned memory. But in cases where the inference is
+  // incomplete, this option can significantly speed up cross-device memory
+  // copies as long as the tensors fit in memory.
+  // Note that this option should not be enabled by default for unknown or
+  // very large models, since all Cuda pinned memory is unpageable; having
+  // too much pinned memory can negatively impact overall host system
+  // performance.
+  bool force_gpu_compatible = 8;
 };
 
 // Options passed to the graph optimizer
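A minimal usage sketch (not part of the diff): enabling the new field from the Python API, assuming a GPU-enabled build of TensorFlow. The tiny graph is only a placeholder.

import tensorflow as tf

config = tf.ConfigProto()
# The field added by this change: force CPU tensors into Cuda pinned
# (page-locked) host memory so device transfers can use DMA.
config.gpu_options.force_gpu_compatible = True

with tf.Session(config=config) as sess:
    x = tf.constant([1.0, 2.0, 3.0])
    print(sess.run(tf.reduce_sum(x)))  # runs with pinned host allocations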
@@ -22,6 +22,10 @@ tf_class {
     name: "Extensions"
     mtype: "<type \'getset_descriptor\'>"
   }
+  member {
+    name: "FORCE_GPU_COMPATIBLE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
   member {
     name: "PER_PROCESS_GPU_MEMORY_FRACTION_FIELD_NUMBER"
     mtype: "<type \'int\'>"
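The hunk above updates the API golden file with the FORCE_GPU_COMPATIBLE_FIELD_NUMBER constant that the protobuf compiler generates on the Python GPUOptions class. A quick sketch (not part of the diff) of reading that constant; the value 8 comes from the field id assigned in config.proto above.

from tensorflow.core.protobuf import config_pb2

# Generated protobuf messages expose a *_FIELD_NUMBER constant per field.
assert config_pb2.GPUOptions.FORCE_GPU_COMPATIBLE_FIELD_NUMBER == 8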