Remove the device memory check, since it is incorrect when the pointer points to pinned host memory. An additional validity check is also unnecessary: the memcpy itself fails if the pointer is invalid.

Added a test for pinned host memory.
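
Editorial illustration, not part of this change: the mis-classification described above can be reproduced with the runtime API alone. A pinned buffer from cudaHostAlloc is legal on the device side of a copy under unified addressing, yet cudaPointerGetAttributes reports it as host memory, so a CHECK that it resides on the device fires. The sketch assumes a CUDA toolchain where cudaPointerAttributes still has the memoryType field, as the deleted code does.

// Editorial sketch, not from this commit: shows why a device-residency
// check mis-fires on pinned host memory.
#include <cstdio>

#include "cuda_runtime.h"

int main() {
  void* pinned = nullptr;
  if (cudaHostAlloc(&pinned, sizeof(int), cudaHostAllocDefault) !=
      cudaSuccess) {
    return 1;
  }
  cudaPointerAttributes attributes;
  if (cudaPointerGetAttributes(&attributes, pinned) == cudaSuccess) {
    // The pointer is usable in device-side memcpys, but it is not reported
    // as cudaMemoryTypeDevice, so the removed CHECK_EQ would have fired.
    printf("is device memory: %d\n",
           attributes.memoryType == cudaMemoryTypeDevice);  // prints 0
  }
  cudaFreeHost(pinned);
  return 0;
}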

PiperOrigin-RevId: 282036798
Change-Id: I6c0aab79a0e1ec1df9e2010e461d2ad8af8a1703
Guangda Lai 2019-11-22 14:02:08 -08:00 committed by TensorFlower Gardener
parent 7499cc4974
commit e840aa5e28
3 changed files with 54 additions and 62 deletions

tensorflow/stream_executor/cuda/BUILD

@@ -129,6 +129,17 @@ cc_library(
     ],
 )
 
+tf_cuda_cc_test(
+    name = "memcpy_test",
+    srcs = ["memcpy_test.cc"],
+    tags = tf_cuda_tests_tags(),
+    deps = [
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/stream_executor/lib",
+    ],
+)
+
 cc_library(
     name = "cudart_stub",
     srcs = select({

tensorflow/stream_executor/cuda/cuda_driver.cc

@@ -180,44 +180,6 @@ string MemorySpaceString(MemorySpace memory_space) {
 namespace {
 
-bool IsPointerCheckDisabled() {
-  // We want to check pointers for validity normally, but the
-  // cudaPointerGetAttributes call actually returns an error if it is given a
-  // host pointer. This confuses tools like cuda-memcheck and cuda-gdb.
-  //
-  // TF_DISABLE_GPU_POINTER_CHECKS gives us an escape hatch for reducing logspam
-  // when using cuda-memcheck and cuda-gdb.
-  return std::getenv("TF_DISABLE_GPU_POINTER_CHECKS") != nullptr;
-}
-
-// Checks that the pointer is to a location on the device it purports to be.
-// PtrT is one of CUdeviceptr or void*. If it's a CUdeviceptr, then
-// cudaPointerGetAttributes should not fail, and return a memoryType of
-// cudaMemoryTypeDevice.
-template <typename PtrT>
-void CheckPointerIsValid(const PtrT ptr, absl::string_view name) {
-  static bool pointer_check_disabled = IsPointerCheckDisabled();
-  if (pointer_check_disabled) {
-    return;
-  }
-
-  bool is_host_ptr = !std::is_same<PtrT, CUdeviceptr>::value;
-  cudaPointerAttributes attributes;
-  cudaError_t err =
-      cudaPointerGetAttributes(&attributes, reinterpret_cast<const void*>(ptr));
-  CHECK(err == cudaSuccess || err == cudaErrorInvalidValue)
-      << "Unexpected CUDA error: " << cudaGetErrorString(err);
-
-  // If we failed, reset cuda error status to avoid poisoning cuda streams.
-  if (err != cudaSuccess) cudaGetLastError();
-  bool points_to_host_memory = (err == cudaErrorInvalidValue ||
-                                attributes.memoryType != cudaMemoryTypeDevice);
-  CHECK_EQ(is_host_ptr, points_to_host_memory) << absl::StreamFormat(
-      "%s pointer is not actually on %s: %p", name, is_host_ptr ? "CPU" : "GPU",
-      reinterpret_cast<const void*>(ptr));
-}
-
 // Call cuCtxSynchronize and crash if it doesn't succeed.
 void SynchronizeOrDie() {
   FAIL_IF_CUDA_RES_ERROR(cuCtxSynchronize(),
@@ -1011,10 +973,6 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                            CUdeviceptr gpu_src,
                                            uint64 size) {
   ScopedActivateContext activation(context);
-  if (size > 0) {
-    CheckPointerIsValid(gpu_src, "src");
-    CheckPointerIsValid(host_dst, "dst");
-  }
   RETURN_IF_CUDA_RES_ERROR(
       cuMemcpyDtoH(host_dst, gpu_src, size),
       absl::StrFormat("failed to synchronous memcpy from device to host "
@@ -1030,10 +988,6 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                            const void* host_src,
                                            uint64 size) {
   ScopedActivateContext activation(context);
-  if (size > 0) {
-    CheckPointerIsValid(host_src, "src");
-    CheckPointerIsValid(gpu_dst, "dst");
-  }
   RETURN_IF_CUDA_RES_ERROR(
       cuMemcpyHtoD(gpu_dst, host_src, size),
       absl::StrFormat(
@@ -1049,10 +1003,6 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                            CUdeviceptr gpu_src,
                                            uint64 size) {
   ScopedActivateContext activation(context);
-  if (size > 0) {
-    CheckPointerIsValid(gpu_src, "src");
-    CheckPointerIsValid(gpu_dst, "dst");
-  }
   RETURN_IF_CUDA_RES_ERROR(
       cuMemcpyDtoD(gpu_dst, gpu_src, size),
       absl::StrFormat(
@@ -1070,10 +1020,6 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                          uint64 size,
                                          CUstream stream) {
   ScopedActivateContext activation(context);
-  if (size > 0) {
-    CheckPointerIsValid(gpu_src, "src");
-    CheckPointerIsValid(host_dst, "dst");
-  }
   CUresult res = cuMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << absl::StrFormat(
@@ -1094,10 +1040,6 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                          uint64 size,
                                          CUstream stream) {
   ScopedActivateContext activation(context);
-  if (size > 0) {
-    CheckPointerIsValid(host_src, "src");
-    CheckPointerIsValid(gpu_dst, "dst");
-  }
   CUresult res = cuMemcpyHtoDAsync(gpu_dst, host_src, size, stream);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << absl::StrFormat(
@@ -1117,10 +1059,6 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                          uint64 size,
                                          CUstream stream) {
   ScopedActivateContext activation(context);
-  if (size > 0) {
-    CheckPointerIsValid(gpu_src, "src");
-    CheckPointerIsValid(gpu_dst, "dst");
-  }
   CUresult result = cuMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
   if (result != CUDA_SUCCESS) {
     LOG(ERROR) << absl::StrFormat(
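
The deleted checks rejected exactly the pattern the new test exercises: pinned host memory used on the device side of a copy. Below is a hedged standalone sketch (editorial, not from this commit; assumes device 0 and a 64-bit unified-addressing platform) of that pattern against the raw driver API. Note that a genuinely invalid pointer makes cuMemcpyDtoH itself return an error, which RETURN_IF_CUDA_RES_ERROR above already surfaces.

// Editorial sketch, not part of this commit.
#include <cstdio>

#include "cuda.h"

int main() {
  if (cuInit(0) != CUDA_SUCCESS) return 1;
  CUdevice device;
  CUcontext context;
  if (cuDeviceGet(&device, 0) != CUDA_SUCCESS) return 1;
  if (cuCtxCreate(&context, 0, device) != CUDA_SUCCESS) return 1;

  // Pinned host allocation, playing the role of d_mem in the new test.
  int* pinned = nullptr;
  if (cuMemHostAlloc(reinterpret_cast<void**>(&pinned), sizeof(int), 0) !=
      CUDA_SUCCESS) {
    return 1;
  }
  *pinned = 42;

  // A "device to host" copy whose device side is really pinned host memory.
  // The old CheckPointerIsValid would CHECK-fail here; the copy itself is
  // legal under unified addressing.
  int host_dst = 0;
  CUresult res = cuMemcpyDtoH(&host_dst, reinterpret_cast<CUdeviceptr>(pinned),
                              sizeof(int));
  printf("res=%d value=%d\n", static_cast<int>(res), host_dst);  // 0, 42

  cuMemFreeHost(pinned);
  cuCtxDestroy(context);
  return 0;
}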

tensorflow/stream_executor/cuda/memcpy_test.cc

@@ -0,0 +1,43 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#include "absl/memory/memory.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/multi_platform_manager.h"
+#include "tensorflow/stream_executor/stream.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+
+namespace stream_executor {
+
+TEST(MemcpyTest, PinnedHostMemory) {
+  Platform* platform =
+      MultiPlatformManager::PlatformWithName("CUDA").ValueOrDie();
+  StreamExecutor* executor = platform->ExecutorForDevice(0).ValueOrDie();
+  Stream stream(executor);
+  stream.Init();
+  ASSERT_TRUE(stream.ok());
+  void* d_ptr = executor->HostMemoryAllocate(sizeof(int));
+  DeviceMemoryBase d_mem(d_ptr, sizeof(int));
+  int h_ptr;
+  stream.ThenMemcpy(&h_ptr, d_mem, d_mem.size());
+  EXPECT_TRUE(stream.BlockHostUntilDone().ok());
+}
+
+}  // namespace stream_executor
+
+#endif  // GOOGLE_CUDA
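
One possible follow-up, offered as an editorial sketch only: the test never frees the pinned buffer. Assuming StreamExecutor::HostMemoryDeallocate is the matching release call for HostMemoryAllocate, the test body could end with:

  // Hypothetical cleanup, assuming HostMemoryDeallocate is the counterpart
  // of HostMemoryAllocate.
  executor->HostMemoryDeallocate(d_ptr);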