From c8a2f59e2e2775808b82c877433ead9545fd7a84 Mon Sep 17 00:00:00 2001
From: Eugene Kuznetsov
Date: Wed, 22 Jan 2020 12:53:00 -0800
Subject: [PATCH] Fixing and enabling
 //tensorflow/core/util:gpu_kernel_helper_test_gpu

---
 tensorflow/core/util/gpu_device_functions.h  |   4 +
 .../core/util/gpu_kernel_helper_test.cu.cc   | 127 +++++++++++-------
 2 files changed, 84 insertions(+), 47 deletions(-)

diff --git a/tensorflow/core/util/gpu_device_functions.h b/tensorflow/core/util/gpu_device_functions.h
index 61d1e3c9453..083d42b7de4 100644
--- a/tensorflow/core/util/gpu_device_functions.h
+++ b/tensorflow/core/util/gpu_device_functions.h
@@ -53,6 +53,8 @@ using gpuEvent_t = cudaEvent_t;
 #define gpuEventCreate cudaEventCreate
 #define gpuEventCreateWithFlags cudaEventCreateWithFlags
 #define gpuEventDisableTiming cudaEventDisableTiming
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuFree cudaFree
 #elif TENSORFLOW_USE_ROCM
 using gpuFloatComplex = hipFloatComplex;
 using gpuDoubleComplex = hipDoubleComplex;
@@ -68,6 +70,8 @@ using cudaError_t = int;
 #define gpuEventCreate hipEventCreate
 #define gpuEventCreateWithFlags hipEventCreateWithFlags
 #define gpuEventDisableTiming hipEventDisableTiming
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuFree hipFree
 static std::string cudaGetErrorString(int err) { return std::to_string(err); }
 #endif
 
diff --git a/tensorflow/core/util/gpu_kernel_helper_test.cu.cc b/tensorflow/core/util/gpu_kernel_helper_test.cu.cc
index c089511e964..3135d25d1b8 100644
--- a/tensorflow/core/util/gpu_kernel_helper_test.cu.cc
+++ b/tensorflow/core/util/gpu_kernel_helper_test.cu.cc
@@ -13,9 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #define EIGEN_USE_GPU
 
+#include 
 #include <numeric>
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -25,14 +26,14 @@ limitations under the License.
 #define CUDA_EXPECT_SUCCESS                                 \
   {                                                         \
-    cudaDeviceSynchronize();                                \
+    gpuDeviceSynchronize();                                 \
     cudaError_t err = cudaGetLastError();                   \
     EXPECT_EQ(cudaSuccess, err) << cudaGetErrorString(err); \
   }
 
 #define CUDA_ASSERT_SUCCESS                                 \
   {                                                         \
-    cudaDeviceSynchronize();                                \
+    gpuDeviceSynchronize();                                 \
     cudaError_t err = cudaGetLastError();                   \
     ASSERT_EQ(cudaSuccess, err) << cudaGetErrorString(err); \
   }
 
@@ -94,8 +95,7 @@ __global__ void Count3D(Gpu3DLaunchConfig config, int bufsize,
   }
 }
 
-__global__ void CudaShuffleGetSrcLaneTest(
-    unsigned* __restrict__ failure_count) {
+__global__ void GpuShuffleGetSrcLaneTest(unsigned* __restrict__ failure_count) {
   unsigned lane_id = GpuLaneId();
   for (int width = warpSize; width > 1; width /= 2) {
     auto check_result = [&](const char* op_name, int param, unsigned actual,
@@ -103,31 +103,38 @@ __global__ void CudaShuffleGetSrcLaneTest(
       if (actual != expected) {
         printf("Cuda%sGetSrcLane(%d, %d) for lane %d returned %d, not %d\n",
                op_name, param, width, lane_id, actual, expected);
-        CudaAtomicAdd(failure_count, 1);
+        GpuAtomicAdd(failure_count, 1);
       }
     };
+
     for (int src_lane = -warpSize; src_lane <= warpSize; ++src_lane) {
-      unsigned actual_lane = detail::CudaShuffleGetSrcLane(src_lane, width);
+#if TENSORFLOW_USE_ROCM
+      if (src_lane < 0 || src_lane >= width) continue;
+#endif
+      unsigned actual_lane = detail::GpuShuffleGetSrcLane(src_lane, width);
       unsigned expect_lane =
-          CudaShuffleSync(kCudaWarpAll, lane_id, src_lane, width);
+          GpuShuffleSync(kCudaWarpAll, lane_id, src_lane, width);
       check_result("Shuffle", src_lane, actual_lane, expect_lane);
     }
+
     for (unsigned delta = 0; delta <= warpSize; ++delta) {
-      unsigned actual_lane = detail::CudaShuffleUpGetSrcLane(delta, width);
+      unsigned actual_lane = detail::GpuShuffleUpGetSrcLane(delta, width);
       unsigned expect_lane =
-          CudaShuffleUpSync(kCudaWarpAll, lane_id, delta, width);
+          GpuShuffleUpSync(kCudaWarpAll, lane_id, delta, width);
       check_result("ShuffleUp", delta, actual_lane, expect_lane);
     }
+
     for (unsigned delta = 0; delta <= warpSize; ++delta) {
-      unsigned actual_lane = detail::CudaShuffleDownGetSrcLane(delta, width);
+      unsigned actual_lane = detail::GpuShuffleDownGetSrcLane(delta, width);
       unsigned expect_lane =
-          CudaShuffleDownSync(kCudaWarpAll, lane_id, delta, width);
+          GpuShuffleDownSync(kCudaWarpAll, lane_id, delta, width);
       check_result("ShuffleDown", delta, actual_lane, expect_lane);
     }
+
     for (int lane_lane = warpSize; lane_lane > 0; lane_lane /= 2) {
-      unsigned actual_lane = detail::CudaShuffleXorGetSrcLane(lane_lane, width);
+      unsigned actual_lane = detail::GpuShuffleXorGetSrcLane(lane_lane, width);
       unsigned expect_lane =
-          CudaShuffleXorSync(kCudaWarpAll, lane_id, lane_lane, width);
+          GpuShuffleXorSync(kCudaWarpAll, lane_id, lane_lane, width);
       check_result("ShuffleXor", lane_lane, actual_lane, expect_lane);
     }
   }
@@ -137,19 +144,32 @@
 class GpuLaunchConfigTest : public ::testing::Test {
  protected:
-  const int bufsize = 1024;
+  static const int bufsize = 1024;
   int* outbuf = nullptr;
+  int* outbuf_host = nullptr;
+  int hostbuf[bufsize];
   Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice d = Eigen::GpuDevice(&stream);
+  void copyToHost() {
+#if TENSORFLOW_USE_ROCM
+    hipMemcpy(hostbuf, outbuf, sizeof(int) * bufsize, hipMemcpyDeviceToHost);
+#endif
+  }
 
   virtual void SetUp() {
+#if GOOGLE_CUDA
     cudaError_t err = cudaMallocManaged(&outbuf, sizeof(int) * bufsize);
+    outbuf_host = outbuf;
+#else
+    cudaError_t err = hipMalloc(&outbuf, sizeof(int) * bufsize);
+    outbuf_host = hostbuf;
+#endif
     ASSERT_EQ(cudaSuccess, err) << cudaGetErrorString(err);
   }
 
   virtual void TearDown() {
-    cudaDeviceSynchronize();
-    cudaFree(outbuf);
+    gpuDeviceSynchronize();
+    gpuFree(outbuf);
     outbuf = nullptr;
   }
 };
 
@@ -158,28 +178,32 @@ TEST_F(GpuLaunchConfigTest, GetGpuLaunchConfig) {
   GpuLaunchConfig cfg;
 
   // test valid inputs
-#define TEST_LAUNCH_PARAMETER(work_element_count)                              \
-  cfg = GetGpuLaunchConfig(bufsize, d);                                        \
-  TF_CHECK_OK(GpuLaunchKernel(SetOutbufZero, cfg.block_count,                  \
-                              cfg.thread_per_block, 0, d.stream(), cfg,        \
-                              outbuf));                                        \
-  CUDA_ASSERT_SUCCESS                                                          \
-  cfg = GetGpuLaunchConfig(work_element_count, d);                             \
-  TF_CHECK_OK(GpuLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block,  \
-                              0, d.stream(), cfg, bufsize, outbuf));           \
-  CUDA_EXPECT_SUCCESS                                                          \
-  EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0)); \
-                                                                               \
-  cfg = GetGpuLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);                   \
-  TF_CHECK_OK(GpuLaunchKernel(SetOutbufZero, cfg.block_count,                  \
-                              cfg.thread_per_block, 0, d.stream(), cfg,        \
-                              outbuf));                                        \
-  CUDA_ASSERT_SUCCESS                                                          \
-  cfg = GetGpuLaunchConfig(work_element_count, d, Count1D, 0, 0);              \
-  TF_CHECK_OK(GpuLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block,  \
-                              0, d.stream(), cfg, bufsize, outbuf));           \
-  CUDA_EXPECT_SUCCESS                                                          \
-  EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0))
+#define TEST_LAUNCH_PARAMETER(work_element_count)                             \
+  cfg = GetGpuLaunchConfig(bufsize, d);                                       \
+  TF_CHECK_OK(GpuLaunchKernel(SetOutbufZero, cfg.block_count,                 \
+                              cfg.thread_per_block, 0, d.stream(), cfg,       \
+                              outbuf));                                       \
+  CUDA_ASSERT_SUCCESS                                                         \
+  cfg = GetGpuLaunchConfig(work_element_count, d);                            \
+  TF_CHECK_OK(GpuLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \
+                              0, d.stream(), cfg, bufsize, outbuf));          \
+  CUDA_EXPECT_SUCCESS                                                         \
+  copyToHost();                                                               \
+  EXPECT_EQ(work_element_count,                                               \
+            std::accumulate(outbuf_host, outbuf_host + bufsize, 0));          \
+                                                                              \
+  cfg = GetGpuLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);                  \
+  TF_CHECK_OK(GpuLaunchKernel(SetOutbufZero, cfg.block_count,                 \
+                              cfg.thread_per_block, 0, d.stream(), cfg,       \
+                              outbuf));                                       \
+  CUDA_ASSERT_SUCCESS                                                         \
+  cfg = GetGpuLaunchConfig(work_element_count, d, Count1D, 0, 0);             \
+  TF_CHECK_OK(GpuLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \
+                              0, d.stream(), cfg, bufsize, outbuf));          \
+  CUDA_EXPECT_SUCCESS                                                         \
+  copyToHost();                                                               \
+  EXPECT_EQ(work_element_count,                                               \
+            std::accumulate(outbuf_host, outbuf_host + bufsize, 0));
 
 TEST_LAUNCH_PARAMETER(128);
 TEST_LAUNCH_PARAMETER(129);
@@ -221,7 +245,9 @@ TEST_F(GpuLaunchConfigTest, GetGpu2DLaunchConfig) {
   TF_EXPECT_OK(GpuLaunchKernel(Count2D, cfg.block_count, cfg.thread_per_block, \
                                0, d.stream(), cfg, bufsize, outbuf));          \
   CUDA_EXPECT_SUCCESS                                                          \
-  EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0));         \
+  copyToHost();                                                                \
+  EXPECT_EQ(dimx* dimy,                                                        \
+            std::accumulate(outbuf_host, outbuf_host + bufsize, 0));           \
                                                                                \
   cfg1d = GetGpuLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);                 \
   TF_EXPECT_OK(GpuLaunchKernel(SetOutbufZero, cfg1d.block_count,               \
@@ -232,7 +258,8 @@ TEST_F(GpuLaunchConfigTest, GetGpu2DLaunchConfig) {
   TF_EXPECT_OK(GpuLaunchKernel(Count2D, cfg.block_count, cfg.thread_per_block, \
                                0, d.stream(), cfg, bufsize, outbuf));          \
   CUDA_EXPECT_SUCCESS                                                          \
-  EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0))
+  copyToHost();                                                                \
+  EXPECT_EQ(dimx* dimy, std::accumulate(outbuf_host, outbuf_host + bufsize, 0))
 
 TEST_LAUNCH_PARAMETER(128, 128);
 TEST_LAUNCH_PARAMETER(129, 64);
@@ -263,7 +290,9 @@ TEST_F(GpuLaunchConfigTest, GetGpu3DLaunchConfig) {
   TF_EXPECT_OK(GpuLaunchKernel(Count3D, cfg.block_count, cfg.thread_per_block, \
                                0, d.stream(), cfg, bufsize, outbuf));          \
   CUDA_EXPECT_SUCCESS                                                          \
-  EXPECT_EQ(dimx* dimy* dimz, std::accumulate(outbuf, outbuf + bufsize, 0))
+  copyToHost();                                                                \
+  EXPECT_EQ(dimx* dimy* dimz,                                                  \
+            std::accumulate(outbuf_host, outbuf_host + bufsize, 0))
 
 TEST_LAUNCH_PARAMETER(128, 128, 128);
 TEST_LAUNCH_PARAMETER(129, 64, 1024);
@@ -282,15 +311,19 @@
 
 TEST(CudaDeviceFunctionsTest, ShuffleGetSrcLane) {
   unsigned* failure_count;
+#if GOOGLE_CUDA
   ASSERT_EQ(cudaMallocManaged(&failure_count, sizeof(unsigned)), cudaSuccess);
+#else
+  ASSERT_EQ(hipHostMalloc(&failure_count, sizeof(unsigned), 0), cudaSuccess);
+#endif
   *failure_count = 0;
-  TF_EXPECT_OK(GpuLaunchKernel(CudaShuffleGetSrcLaneTest, 1, 32, 0, nullptr,
-                               failure_count));
-  ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);
+  TF_EXPECT_OK(GpuLaunchKernel(GpuShuffleGetSrcLaneTest, 1, TF_RED_WARPSIZE, 0,
+                               nullptr, failure_count));
+  ASSERT_EQ(gpuDeviceSynchronize(), cudaSuccess);
   ASSERT_EQ(*failure_count, 0);
-  cudaFree(failure_count);
+  gpuFree(failure_count);
 }
 
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
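
Aside on the two memory paths above (illustrative note, not part of the patch): cudaMallocManaged returns unified memory that the host can read in place once the device has synchronized, while plain hipMalloc memory is device-only, which is why the ROCm build stages results through hostbuf/outbuf_host via copyToHost(). A minimal standalone sketch of that pattern, assuming exactly one of GOOGLE_CUDA or TENSORFLOW_USE_ROCM is defined as in the patch; SumDeviceBuffer is a hypothetical helper, not a TensorFlow API:

// Sketch only: sum n ints that a kernel wrote into a device buffer.
#include <numeric>
#include <vector>

#if GOOGLE_CUDA
#include <cuda_runtime.h>
#elif TENSORFLOW_USE_ROCM
#include <hip/hip_runtime.h>
#endif

int SumDeviceBuffer(const int* device_buf, int n) {
#if GOOGLE_CUDA
  // Managed (cudaMallocManaged) memory: sync, then read in place on the host.
  cudaDeviceSynchronize();
  return std::accumulate(device_buf, device_buf + n, 0);
#elif TENSORFLOW_USE_ROCM
  // Device-only (hipMalloc) memory: copy back explicitly first, as the
  // fixture's copyToHost() does.
  std::vector<int> host(n);
  hipMemcpy(host.data(), device_buf, n * sizeof(int), hipMemcpyDeviceToHost);
  return std::accumulate(host.begin(), host.end(), 0);
#endif
}

The switch from a hard-coded 32 to TF_RED_WARPSIZE when launching GpuShuffleGetSrcLaneTest follows the same portability logic: AMD wavefronts are 64 lanes wide, so "one warp's worth of threads" differs between the two platforms.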