Disable tensor tracking when only one GPU stream is used.
Change: 113579306

commit d821f6aeb6
parent 45bb3fdcab
@@ -207,6 +207,13 @@ BaseGPUDevice::~BaseGPUDevice() {
   gtl::STLDeleteElements(&streams_);
 }
 
+bool BaseGPUDevice::RequiresRecordingAccessedTensors() const {
+  // When there is no more than one stream, we release the tensor reference
+  // at the end of the kernel launch, instead of at the end of the kernel
+  // execution.
+  return streams_.size() > 1;
+}
+
 Status BaseGPUDevice::FillContextMap(const Graph* graph,
                                      DeviceContextMap* device_context_map) {
   VLOG(2) << "FillContextMap";
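The new predicate is the heart of the change: with a single compute stream, kernels run in FIFO order, so a tensor's memory cannot be recycled by a concurrently running kernel, and the executor may drop its reference as soon as the launch is enqueued. A minimal standalone sketch of the two lifetime strategies; the Device class below is an illustrative stand-in, not tensorflow::BaseGPUDevice:

// Sketch: why streams_.size() > 1 is the switch (illustrative names only).
#include <cstddef>
#include <cstdio>

class Device {
 public:
  explicit Device(size_t num_streams) : num_streams_(num_streams) {}

  // With one stream, kernels execute in FIFO order, so memory released at
  // launch time cannot be handed to a kernel that overtakes the current one.
  bool RequiresRecordingAccessedTensors() const { return num_streams_ > 1; }

 private:
  size_t num_streams_;
};

void LaunchKernel(const Device& dev) {
  if (dev.RequiresRecordingAccessedTensors()) {
    // Multi-stream: hold tensor references until execution completes,
    // since another stream could reuse the memory concurrently.
    std::puts("track tensors until kernel execution finishes");
  } else {
    // Single stream: drop references once the launch is enqueued.
    std::puts("release tensor references at end of kernel launch");
  }
}

int main() {
  LaunchKernel(Device(1));  // one stream   -> early release
  LaunchKernel(Device(4));  // many streams -> record accessed tensors
}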
@@ -51,7 +51,7 @@ class BaseGPUDevice : public LocalDevice {
   // GPU devices require the Op Compute method to save a reference to
   // any temporary tensors that are allocated until the Op execution
   // completes.
-  bool RequiresRecordingAccessedTensors() const override { return true; }
+  bool RequiresRecordingAccessedTensors() const override;
 
   void ConsumeListOfAccessedTensors(
       DeviceContext* device_context,
@@ -173,9 +173,9 @@ void GPUUtil::DeviceToDeviceCopy(DeviceContext* send_dev_context,
                                  const Tensor* input, Tensor* output,
                                  StatusCallback done) {
   const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
-  gpu::Stream* stream = nullptr;
-  Status s =
-      PrepareCopy(src, send_dev_context, *input, output, &dev_info, &stream);
+  gpu::Stream* send_stream = nullptr;
+  Status s = PrepareCopy(src, send_dev_context, *input, output, &dev_info,
+                         &send_stream);
   if (!s.ok()) {
     done(s);
     return;
@@ -187,20 +187,33 @@ void GPUUtil::DeviceToDeviceCopy(DeviceContext* send_dev_context,
   DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes);
   void* dst_ptr = GetBase(output);
   DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes);
 
+  // Since we want to use the memory from recv_stream in the send_stream,
+  // add a dependency to make sure the memory is truly free.
+  // TODO(zhengxq): remove this dependency when we switch to a better way
+  // to make sure the memory is free.
+  auto recv_stream =
+      static_cast<const GPUDeviceContext*>(recv_dev_context)->stream();
+  if (recv_stream == nullptr) {
+    done(errors::Internal("No recv gpu stream is available."));
+    return;
+  }
+  send_stream->ThenWaitFor(recv_stream);
+
   VLOG(2) << "src_ptr " << src_ptr << " dst_ptr " << dst_ptr;
-  stream->ThenMemcpy(&gpu_dst_ptr, gpu_src_ptr, total_bytes);
+  send_stream->ThenMemcpy(&gpu_dst_ptr, gpu_src_ptr, total_bytes);
 
   // Use of input may outlive stack scope, so keep a ref.
   TensorReference input_ref(*input);
-  dev_info->event_mgr->ThenExecute(stream, [done, stream, input_ref]() {
-    input_ref.Unref();
-    if (!stream->ok()) {
-      LOG(FATAL) << "GPU->GPU Memcpy failed";
-    }
-    done(Status::OK());
-  });
-  send_dev_context->MaintainLifetimeOnStream(input, stream);
+  dev_info->event_mgr->ThenExecute(send_stream,
+                                   [done, send_stream, input_ref]() {
+                                     input_ref.Unref();
+                                     if (!send_stream->ok()) {
+                                       LOG(FATAL) << "GPU->GPU Memcpy failed";
+                                     }
+                                     done(Status::OK());
+                                   });
+  send_dev_context->MaintainLifetimeOnStream(input, send_stream);
 }
 
 static CopyTensor::Registration register_gpu_gpu_copy(
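The renamed send_stream and the new ThenWaitFor(recv_stream) encode a cross-stream dependency: the copy must not start until the receiver's previously enqueued work is done with the destination memory. A hedged sketch of the same pattern in the raw CUDA runtime API (StreamExecutor's ThenWaitFor wraps this kind of on-device event wait); stream and buffer names are illustrative:

// Sketch: ordering a device-to-device copy on send_stream after the
// pending work on recv_stream, without blocking the host.
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  cudaStream_t send_stream, recv_stream;
  cudaStreamCreate(&send_stream);
  cudaStreamCreate(&recv_stream);

  float *src, *dst;
  cudaMalloc(&src, 1024 * sizeof(float));
  cudaMalloc(&dst, 1024 * sizeof(float));

  // Mark "everything enqueued on recv_stream so far" with an event...
  cudaEvent_t recv_done;
  cudaEventCreateWithFlags(&recv_done, cudaEventDisableTiming);
  cudaEventRecord(recv_done, recv_stream);

  // ...and make send_stream wait for it on the device. This is the moral
  // equivalent of send_stream->ThenWaitFor(recv_stream) in the diff.
  cudaStreamWaitEvent(send_stream, recv_done, 0);

  // The copy cannot begin before the receiver's prior work completes, so
  // the destination memory is truly free when it is overwritten.
  cudaMemcpyAsync(dst, src, 1024 * sizeof(float),
                  cudaMemcpyDeviceToDevice, send_stream);

  cudaStreamSynchronize(send_stream);
  std::puts("copy ordered after recv_stream's prior work");

  cudaEventDestroy(recv_done);
  cudaFree(src);
  cudaFree(dst);
  cudaStreamDestroy(send_stream);
  cudaStreamDestroy(recv_stream);
}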
@@ -67,6 +67,9 @@ class CudnnScratchAllocator : public perftools::gputools::ScratchAllocator {
           perftools::gputools::DeviceMemory<uint8>>(
           AsDeviceMemory<uint8>(nullptr, 0));
     }
+    // Hold the reference of the allocated tensors until the end of the
+    // allocator.
+    allocated_tensors_.push_back(temporary_memory);
     return perftools::gputools::port::StatusOr<
         perftools::gputools::DeviceMemory<uint8>>(
         AsDeviceMemory(temporary_memory.flat<uint8>().data(),
@@ -76,6 +79,7 @@ class CudnnScratchAllocator : public perftools::gputools::ScratchAllocator {
  private:
   int64 memory_limit_;
   OpKernelContext* context_;
+  std::vector<Tensor> allocated_tensors_;
 };
 
 }  // namespace tensorflow
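With per-launch reference recording disabled on single-stream devices, the scratch allocator now keeps its own references in allocated_tensors_, so cudnn workspace buffers outlive the launch even when the device no longer records accessed tensors. A minimal sketch of that ownership trick with simplified stand-in types, not the TensorFlow classes:

// Sketch: an allocator that pins every allocation for its own lifetime.
#include <cstddef>
#include <cstdint>
#include <memory>
#include <vector>

class ScratchAllocator {
 public:
  // Returns a raw pointer the caller may use freely; ownership stays here.
  uint8_t* AllocateBytes(std::size_t byte_size) {
    auto buffer = std::make_unique<uint8_t[]>(byte_size);
    uint8_t* ptr = buffer.get();
    // Hold the reference until the allocator itself is destroyed, mirroring
    // allocated_tensors_.push_back(temporary_memory) in the diff.
    allocated_buffers_.push_back(std::move(buffer));
    return ptr;
  }

 private:
  std::vector<std::unique_ptr<uint8_t[]>> allocated_buffers_;
};

int main() {
  ScratchAllocator scratch;
  uint8_t* workspace = scratch.AllocateBytes(1 << 20);
  workspace[0] = 42;  // valid until `scratch` goes out of scope
}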