From 450a690cfe0d7c57464110734cebfe7a5b239e0d Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Fri, 5 Jul 2019 03:26:45 -0700 Subject: [PATCH] Replacing GetCudaLaunchConfig and CudaLaunchKernel with their Gpu equivalent. PiperOrigin-RevId: 256648520 --- .../core/kernels/rnn/lstm_ops_gpu.cu.cc | 12 +-- .../core/util/gpu_kernel_helper_test.cu.cc | 89 +++++++++---------- .../adding_an_op/cuda_op_kernel.cu.cc | 4 +- .../builds/user_ops/cuda_op_kernel.cu.cc | 4 +- 4 files changed, 53 insertions(+), 56 deletions(-) diff --git a/tensorflow/core/kernels/rnn/lstm_ops_gpu.cu.cc b/tensorflow/core/kernels/rnn/lstm_ops_gpu.cu.cc index 710b0eb7c8d..4101ee8ed2f 100644 --- a/tensorflow/core/kernels/rnn/lstm_ops_gpu.cu.cc +++ b/tensorflow/core/kernels/rnn/lstm_ops_gpu.cu.cc @@ -241,9 +241,9 @@ void LSTMBlockCellFpropWithCUDA( const int block_dim = 128; const int grid_dim = Eigen::divup(batch_size * (cell_size + input_size), block_dim); - TF_CHECK_OK(CudaLaunchKernel(concat_xh, grid_dim, block_dim, 0, cu_stream, - xh.data(), x.data(), h_prev.data(), batch_size, - cell_size, input_size)); + TF_CHECK_OK(GpuLaunchKernel(concat_xh, grid_dim, block_dim, 0, cu_stream, + xh.data(), x.data(), h_prev.data(), batch_size, + cell_size, input_size)); // states1 = xh * w typename TTypes::ConstMatrix const_xh(xh.data(), xh.dimensions()); @@ -261,13 +261,13 @@ void LSTMBlockCellFpropWithCUDA( Eigen::divup(cell_size, static_cast(block_dim_2d.y))); if (use_peephole) { - TF_CHECK_OK(CudaLaunchKernel( + TF_CHECK_OK(GpuLaunchKernel( lstm_gates, grid_dim_2d, block_dim_2d, 0, cu_stream, icfo.data(), b.data(), cs_prev.data(), wci.data(), wcf.data(), wco.data(), o.data(), h.data(), ci.data(), cs.data(), co.data(), i.data(), f.data(), forget_bias, cell_clip, batch_size, cell_size)); } else { - TF_CHECK_OK(CudaLaunchKernel( + TF_CHECK_OK(GpuLaunchKernel( lstm_gates, grid_dim_2d, block_dim_2d, 0, cu_stream, icfo.data(), b.data(), cs_prev.data(), wci.data(), wcf.data(), wco.data(), o.data(), h.data(), ci.data(), cs.data(), co.data(), @@ -376,7 +376,7 @@ void LSTMBlockCellBpropWithCUDA( dim3 grid_dim_2d(Eigen::divup(batch_size, static_cast(block_dim_2d.x)), Eigen::divup(cell_size, static_cast(block_dim_2d.y))); - TF_CHECK_OK(CudaLaunchKernel( + TF_CHECK_OK(GpuLaunchKernel( lstm_gates_bprop, grid_dim_2d, block_dim_2d, 0, cu_stream, cs_prev.data(), h_prev.data(), w.data(), wci.data(), wcf.data(), wco.data(), b.data(), i.data(), cs.data(), f.data(), o.data(), ci.data(), diff --git a/tensorflow/core/util/gpu_kernel_helper_test.cu.cc b/tensorflow/core/util/gpu_kernel_helper_test.cu.cc index 1633f9e8907..528bc559a20 100644 --- a/tensorflow/core/util/gpu_kernel_helper_test.cu.cc +++ b/tensorflow/core/util/gpu_kernel_helper_test.cu.cc @@ -155,24 +155,24 @@ TEST_F(GpuLaunchConfigTest, GetGpuLaunchConfig) { // test valid inputs #define TEST_LAUNCH_PARAMETER(work_element_count) \ cfg = GetGpuLaunchConfig(bufsize, d); \ - TF_CHECK_OK(CudaLaunchKernel(SetOutbufZero, cfg.block_count, \ - cfg.thread_per_block, 0, d.stream(), cfg, \ - outbuf)); \ + TF_CHECK_OK(GpuLaunchKernel(SetOutbufZero, cfg.block_count, \ + cfg.thread_per_block, 0, d.stream(), cfg, \ + outbuf)); \ CUDA_ASSERT_SUCCESS \ cfg = GetGpuLaunchConfig(work_element_count, d); \ - TF_CHECK_OK(CudaLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \ - 0, d.stream(), cfg, bufsize, outbuf)); \ + TF_CHECK_OK(GpuLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \ + 0, d.stream(), cfg, bufsize, outbuf)); \ CUDA_EXPECT_SUCCESS \ EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0)); \ \ cfg = GetGpuLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \ - TF_CHECK_OK(CudaLaunchKernel(SetOutbufZero, cfg.block_count, \ - cfg.thread_per_block, 0, d.stream(), cfg, \ - outbuf)); \ + TF_CHECK_OK(GpuLaunchKernel(SetOutbufZero, cfg.block_count, \ + cfg.thread_per_block, 0, d.stream(), cfg, \ + outbuf)); \ CUDA_ASSERT_SUCCESS \ cfg = GetGpuLaunchConfig(work_element_count, d, Count1D, 0, 0); \ - TF_CHECK_OK(CudaLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \ - 0, d.stream(), cfg, bufsize, outbuf)); \ + TF_CHECK_OK(GpuLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \ + 0, d.stream(), cfg, bufsize, outbuf)); \ CUDA_EXPECT_SUCCESS \ EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0)) @@ -206,29 +206,27 @@ TEST_F(GpuLaunchConfigTest, GetGpu2DLaunchConfig) { GpuLaunchConfig cfg1d; // test valid inputs -#define TEST_LAUNCH_PARAMETER(dimx, dimy) \ - cfg1d = GetGpuLaunchConfig(bufsize, d); \ - TF_EXPECT_OK(CudaLaunchKernel(SetOutbufZero, cfg1d.block_count, \ - cfg1d.thread_per_block, 0, d.stream(), cfg1d, \ - outbuf)); \ - CUDA_ASSERT_SUCCESS \ - cfg = GetGpu2DLaunchConfig(dimx, dimy, d); \ - TF_EXPECT_OK(CudaLaunchKernel(Count2D, cfg.block_count, \ - cfg.thread_per_block, 0, d.stream(), cfg, \ - bufsize, outbuf)); \ - CUDA_EXPECT_SUCCESS \ - EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0)); \ - \ - cfg1d = GetGpuLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \ - TF_EXPECT_OK(CudaLaunchKernel(SetOutbufZero, cfg1d.block_count, \ - cfg1d.thread_per_block, 0, d.stream(), cfg1d, \ - outbuf)); \ - CUDA_ASSERT_SUCCESS \ - cfg = GetGpu2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0); \ - TF_EXPECT_OK(CudaLaunchKernel(Count2D, cfg.block_count, \ - cfg.thread_per_block, 0, d.stream(), cfg, \ - bufsize, outbuf)); \ - CUDA_EXPECT_SUCCESS \ +#define TEST_LAUNCH_PARAMETER(dimx, dimy) \ + cfg1d = GetGpuLaunchConfig(bufsize, d); \ + TF_EXPECT_OK(GpuLaunchKernel(SetOutbufZero, cfg1d.block_count, \ + cfg1d.thread_per_block, 0, d.stream(), cfg1d, \ + outbuf)); \ + CUDA_ASSERT_SUCCESS \ + cfg = GetGpu2DLaunchConfig(dimx, dimy, d); \ + TF_EXPECT_OK(GpuLaunchKernel(Count2D, cfg.block_count, cfg.thread_per_block, \ + 0, d.stream(), cfg, bufsize, outbuf)); \ + CUDA_EXPECT_SUCCESS \ + EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0)); \ + \ + cfg1d = GetGpuLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \ + TF_EXPECT_OK(GpuLaunchKernel(SetOutbufZero, cfg1d.block_count, \ + cfg1d.thread_per_block, 0, d.stream(), cfg1d, \ + outbuf)); \ + CUDA_ASSERT_SUCCESS \ + cfg = GetGpu2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0); \ + TF_EXPECT_OK(GpuLaunchKernel(Count2D, cfg.block_count, cfg.thread_per_block, \ + 0, d.stream(), cfg, bufsize, outbuf)); \ + CUDA_EXPECT_SUCCESS \ EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0)) TEST_LAUNCH_PARAMETER(128, 128); @@ -250,17 +248,16 @@ TEST_F(GpuLaunchConfigTest, GetGpu3DLaunchConfig) { GpuLaunchConfig cfg1d; // test valid inputs -#define TEST_LAUNCH_PARAMETER(dimx, dimy, dimz) \ - cfg1d = GetGpuLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \ - TF_EXPECT_OK(CudaLaunchKernel(SetOutbufZero, cfg1d.block_count, \ - cfg1d.thread_per_block, 0, d.stream(), cfg1d, \ - outbuf)); \ - CUDA_ASSERT_SUCCESS \ - cfg = GetGpu3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0); \ - TF_EXPECT_OK(CudaLaunchKernel(Count3D, cfg.block_count, \ - cfg.thread_per_block, 0, d.stream(), cfg, \ - bufsize, outbuf)); \ - CUDA_EXPECT_SUCCESS \ +#define TEST_LAUNCH_PARAMETER(dimx, dimy, dimz) \ + cfg1d = GetGpuLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \ + TF_EXPECT_OK(GpuLaunchKernel(SetOutbufZero, cfg1d.block_count, \ + cfg1d.thread_per_block, 0, d.stream(), cfg1d, \ + outbuf)); \ + CUDA_ASSERT_SUCCESS \ + cfg = GetGpu3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0); \ + TF_EXPECT_OK(GpuLaunchKernel(Count3D, cfg.block_count, cfg.thread_per_block, \ + 0, d.stream(), cfg, bufsize, outbuf)); \ + CUDA_EXPECT_SUCCESS \ EXPECT_EQ(dimx* dimy* dimz, std::accumulate(outbuf, outbuf + bufsize, 0)) TEST_LAUNCH_PARAMETER(128, 128, 128); @@ -282,8 +279,8 @@ TEST(CudaDeviceFunctionsTest, ShuffleGetSrcLane) { unsigned* failure_count; ASSERT_EQ(cudaMallocManaged(&failure_count, sizeof(unsigned)), cudaSuccess); *failure_count = 0; - TF_EXPECT_OK(CudaLaunchKernel(CudaShuffleGetSrcLaneTest, 1, 32, 0, nullptr, - failure_count)); + TF_EXPECT_OK(GpuLaunchKernel(CudaShuffleGetSrcLaneTest, 1, 32, 0, nullptr, + failure_count)); ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess); ASSERT_EQ(*failure_count, 0); cudaFree(failure_count); diff --git a/tensorflow/examples/adding_an_op/cuda_op_kernel.cu.cc b/tensorflow/examples/adding_an_op/cuda_op_kernel.cu.cc index 1dcf23e4d03..a9d66f9850e 100644 --- a/tensorflow/examples/adding_an_op/cuda_op_kernel.cu.cc +++ b/tensorflow/examples/adding_an_op/cuda_op_kernel.cu.cc @@ -27,8 +27,8 @@ __global__ void AddOneKernel(const int* in, const int N, int* out) { } void AddOneKernelLauncher(const int* in, const int N, int* out) { - TF_CHECK_OK(::tensorflow::CudaLaunchKernel(AddOneKernel, 32, 256, 0, nullptr, - in, N, out)); + TF_CHECK_OK(::tensorflow::GpuLaunchKernel(AddOneKernel, 32, 256, 0, nullptr, + in, N, out)); } #endif diff --git a/tensorflow/tools/ci_build/builds/user_ops/cuda_op_kernel.cu.cc b/tensorflow/tools/ci_build/builds/user_ops/cuda_op_kernel.cu.cc index 53c9962246c..052788ed6dc 100644 --- a/tensorflow/tools/ci_build/builds/user_ops/cuda_op_kernel.cu.cc +++ b/tensorflow/tools/ci_build/builds/user_ops/cuda_op_kernel.cu.cc @@ -26,8 +26,8 @@ __global__ void AddOneKernel(const int* in, const int N, int* out) { } void AddOneKernelLauncher(const int* in, const int N, int* out) { - TF_CHECK_OK(::tensorflow::CudaLaunchKernel(AddOneKernel, 32, 256, 0, nullptr, - in, N, out)); + TF_CHECK_OK(::tensorflow::GpuLaunchKernel(AddOneKernel, 32, 256, 0, nullptr, + in, N, out)); } #endif