Replacing GetCudaLaunchConfig and CudaLaunchKernel with their Gpu equivalents (GetGpuLaunchConfig and GpuLaunchKernel).

PiperOrigin-RevId: 256648520
This commit is contained in:
Christian Sigg 2019-07-05 03:26:45 -07:00 committed by TensorFlower Gardener
parent 2e00e03972
commit 450a690cfe
4 changed files with 53 additions and 56 deletions

View File

@ -241,9 +241,9 @@ void LSTMBlockCellFpropWithCUDA(
const int block_dim = 128;
const int grid_dim =
Eigen::divup(batch_size * (cell_size + input_size), block_dim);
TF_CHECK_OK(CudaLaunchKernel(concat_xh<T>, grid_dim, block_dim, 0, cu_stream,
xh.data(), x.data(), h_prev.data(), batch_size,
cell_size, input_size));
TF_CHECK_OK(GpuLaunchKernel(concat_xh<T>, grid_dim, block_dim, 0, cu_stream,
xh.data(), x.data(), h_prev.data(), batch_size,
cell_size, input_size));
// states1 = xh * w
typename TTypes<T>::ConstMatrix const_xh(xh.data(), xh.dimensions());
@ -261,13 +261,13 @@ void LSTMBlockCellFpropWithCUDA(
Eigen::divup(cell_size, static_cast<int>(block_dim_2d.y)));
if (use_peephole) {
TF_CHECK_OK(CudaLaunchKernel(
TF_CHECK_OK(GpuLaunchKernel(
lstm_gates<T, true>, grid_dim_2d, block_dim_2d, 0, cu_stream,
icfo.data(), b.data(), cs_prev.data(), wci.data(), wcf.data(),
wco.data(), o.data(), h.data(), ci.data(), cs.data(), co.data(),
i.data(), f.data(), forget_bias, cell_clip, batch_size, cell_size));
} else {
TF_CHECK_OK(CudaLaunchKernel(
TF_CHECK_OK(GpuLaunchKernel(
lstm_gates<T, false>, grid_dim_2d, block_dim_2d, 0, cu_stream,
icfo.data(), b.data(), cs_prev.data(), wci.data(), wcf.data(),
wco.data(), o.data(), h.data(), ci.data(), cs.data(), co.data(),
@ -376,7 +376,7 @@ void LSTMBlockCellBpropWithCUDA(
dim3 grid_dim_2d(Eigen::divup(batch_size, static_cast<int>(block_dim_2d.x)),
Eigen::divup(cell_size, static_cast<int>(block_dim_2d.y)));
TF_CHECK_OK(CudaLaunchKernel(
TF_CHECK_OK(GpuLaunchKernel(
lstm_gates_bprop<T>, grid_dim_2d, block_dim_2d, 0, cu_stream,
cs_prev.data(), h_prev.data(), w.data(), wci.data(), wcf.data(),
wco.data(), b.data(), i.data(), cs.data(), f.data(), o.data(), ci.data(),

View File

@ -155,24 +155,24 @@ TEST_F(GpuLaunchConfigTest, GetGpuLaunchConfig) {
// test valid inputs
#define TEST_LAUNCH_PARAMETER(work_element_count) \
cfg = GetGpuLaunchConfig(bufsize, d); \
TF_CHECK_OK(CudaLaunchKernel(SetOutbufZero, cfg.block_count, \
cfg.thread_per_block, 0, d.stream(), cfg, \
outbuf)); \
TF_CHECK_OK(GpuLaunchKernel(SetOutbufZero, cfg.block_count, \
cfg.thread_per_block, 0, d.stream(), cfg, \
outbuf)); \
CUDA_ASSERT_SUCCESS \
cfg = GetGpuLaunchConfig(work_element_count, d); \
TF_CHECK_OK(CudaLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \
0, d.stream(), cfg, bufsize, outbuf)); \
TF_CHECK_OK(GpuLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \
0, d.stream(), cfg, bufsize, outbuf)); \
CUDA_EXPECT_SUCCESS \
EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0)); \
\
cfg = GetGpuLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \
TF_CHECK_OK(CudaLaunchKernel(SetOutbufZero, cfg.block_count, \
cfg.thread_per_block, 0, d.stream(), cfg, \
outbuf)); \
TF_CHECK_OK(GpuLaunchKernel(SetOutbufZero, cfg.block_count, \
cfg.thread_per_block, 0, d.stream(), cfg, \
outbuf)); \
CUDA_ASSERT_SUCCESS \
cfg = GetGpuLaunchConfig(work_element_count, d, Count1D, 0, 0); \
TF_CHECK_OK(CudaLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \
0, d.stream(), cfg, bufsize, outbuf)); \
TF_CHECK_OK(GpuLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \
0, d.stream(), cfg, bufsize, outbuf)); \
CUDA_EXPECT_SUCCESS \
EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0))
@ -206,29 +206,27 @@ TEST_F(GpuLaunchConfigTest, GetGpu2DLaunchConfig) {
GpuLaunchConfig cfg1d;
// test valid inputs
#define TEST_LAUNCH_PARAMETER(dimx, dimy) \
cfg1d = GetGpuLaunchConfig(bufsize, d); \
TF_EXPECT_OK(CudaLaunchKernel(SetOutbufZero, cfg1d.block_count, \
cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
outbuf)); \
CUDA_ASSERT_SUCCESS \
cfg = GetGpu2DLaunchConfig(dimx, dimy, d); \
TF_EXPECT_OK(CudaLaunchKernel(Count2D, cfg.block_count, \
cfg.thread_per_block, 0, d.stream(), cfg, \
bufsize, outbuf)); \
CUDA_EXPECT_SUCCESS \
EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0)); \
\
cfg1d = GetGpuLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \
TF_EXPECT_OK(CudaLaunchKernel(SetOutbufZero, cfg1d.block_count, \
cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
outbuf)); \
CUDA_ASSERT_SUCCESS \
cfg = GetGpu2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0); \
TF_EXPECT_OK(CudaLaunchKernel(Count2D, cfg.block_count, \
cfg.thread_per_block, 0, d.stream(), cfg, \
bufsize, outbuf)); \
CUDA_EXPECT_SUCCESS \
#define TEST_LAUNCH_PARAMETER(dimx, dimy) \
cfg1d = GetGpuLaunchConfig(bufsize, d); \
TF_EXPECT_OK(GpuLaunchKernel(SetOutbufZero, cfg1d.block_count, \
cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
outbuf)); \
CUDA_ASSERT_SUCCESS \
cfg = GetGpu2DLaunchConfig(dimx, dimy, d); \
TF_EXPECT_OK(GpuLaunchKernel(Count2D, cfg.block_count, cfg.thread_per_block, \
0, d.stream(), cfg, bufsize, outbuf)); \
CUDA_EXPECT_SUCCESS \
EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0)); \
\
cfg1d = GetGpuLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \
TF_EXPECT_OK(GpuLaunchKernel(SetOutbufZero, cfg1d.block_count, \
cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
outbuf)); \
CUDA_ASSERT_SUCCESS \
cfg = GetGpu2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0); \
TF_EXPECT_OK(GpuLaunchKernel(Count2D, cfg.block_count, cfg.thread_per_block, \
0, d.stream(), cfg, bufsize, outbuf)); \
CUDA_EXPECT_SUCCESS \
EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0))
TEST_LAUNCH_PARAMETER(128, 128);
@ -250,17 +248,16 @@ TEST_F(GpuLaunchConfigTest, GetGpu3DLaunchConfig) {
GpuLaunchConfig cfg1d;
// test valid inputs
#define TEST_LAUNCH_PARAMETER(dimx, dimy, dimz) \
cfg1d = GetGpuLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \
TF_EXPECT_OK(CudaLaunchKernel(SetOutbufZero, cfg1d.block_count, \
cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
outbuf)); \
CUDA_ASSERT_SUCCESS \
cfg = GetGpu3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0); \
TF_EXPECT_OK(CudaLaunchKernel(Count3D, cfg.block_count, \
cfg.thread_per_block, 0, d.stream(), cfg, \
bufsize, outbuf)); \
CUDA_EXPECT_SUCCESS \
#define TEST_LAUNCH_PARAMETER(dimx, dimy, dimz) \
cfg1d = GetGpuLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \
TF_EXPECT_OK(GpuLaunchKernel(SetOutbufZero, cfg1d.block_count, \
cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
outbuf)); \
CUDA_ASSERT_SUCCESS \
cfg = GetGpu3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0); \
TF_EXPECT_OK(GpuLaunchKernel(Count3D, cfg.block_count, cfg.thread_per_block, \
0, d.stream(), cfg, bufsize, outbuf)); \
CUDA_EXPECT_SUCCESS \
EXPECT_EQ(dimx* dimy* dimz, std::accumulate(outbuf, outbuf + bufsize, 0))
TEST_LAUNCH_PARAMETER(128, 128, 128);
@ -282,8 +279,8 @@ TEST(CudaDeviceFunctionsTest, ShuffleGetSrcLane) {
unsigned* failure_count;
ASSERT_EQ(cudaMallocManaged(&failure_count, sizeof(unsigned)), cudaSuccess);
*failure_count = 0;
TF_EXPECT_OK(CudaLaunchKernel(CudaShuffleGetSrcLaneTest, 1, 32, 0, nullptr,
failure_count));
TF_EXPECT_OK(GpuLaunchKernel(CudaShuffleGetSrcLaneTest, 1, 32, 0, nullptr,
failure_count));
ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);
ASSERT_EQ(*failure_count, 0);
cudaFree(failure_count);

View File

@ -27,8 +27,8 @@ __global__ void AddOneKernel(const int* in, const int N, int* out) {
}
void AddOneKernelLauncher(const int* in, const int N, int* out) {
TF_CHECK_OK(::tensorflow::CudaLaunchKernel(AddOneKernel, 32, 256, 0, nullptr,
in, N, out));
TF_CHECK_OK(::tensorflow::GpuLaunchKernel(AddOneKernel, 32, 256, 0, nullptr,
in, N, out));
}
#endif

View File

@ -26,8 +26,8 @@ __global__ void AddOneKernel(const int* in, const int N, int* out) {
}
void AddOneKernelLauncher(const int* in, const int N, int* out) {
TF_CHECK_OK(::tensorflow::CudaLaunchKernel(AddOneKernel, 32, 256, 0, nullptr,
in, N, out));
TF_CHECK_OK(::tensorflow::GpuLaunchKernel(AddOneKernel, 32, 256, 0, nullptr,
in, N, out));
}
#endif