Replacing GetCudaLaunchConfig and CudaLaunchKernel with their Gpu equivalent.
PiperOrigin-RevId: 256648520
This commit is contained in:
parent
2e00e03972
commit
450a690cfe
@ -241,9 +241,9 @@ void LSTMBlockCellFpropWithCUDA(
|
||||
const int block_dim = 128;
|
||||
const int grid_dim =
|
||||
Eigen::divup(batch_size * (cell_size + input_size), block_dim);
|
||||
TF_CHECK_OK(CudaLaunchKernel(concat_xh<T>, grid_dim, block_dim, 0, cu_stream,
|
||||
xh.data(), x.data(), h_prev.data(), batch_size,
|
||||
cell_size, input_size));
|
||||
TF_CHECK_OK(GpuLaunchKernel(concat_xh<T>, grid_dim, block_dim, 0, cu_stream,
|
||||
xh.data(), x.data(), h_prev.data(), batch_size,
|
||||
cell_size, input_size));
|
||||
|
||||
// states1 = xh * w
|
||||
typename TTypes<T>::ConstMatrix const_xh(xh.data(), xh.dimensions());
|
||||
@ -261,13 +261,13 @@ void LSTMBlockCellFpropWithCUDA(
|
||||
Eigen::divup(cell_size, static_cast<int>(block_dim_2d.y)));
|
||||
|
||||
if (use_peephole) {
|
||||
TF_CHECK_OK(CudaLaunchKernel(
|
||||
TF_CHECK_OK(GpuLaunchKernel(
|
||||
lstm_gates<T, true>, grid_dim_2d, block_dim_2d, 0, cu_stream,
|
||||
icfo.data(), b.data(), cs_prev.data(), wci.data(), wcf.data(),
|
||||
wco.data(), o.data(), h.data(), ci.data(), cs.data(), co.data(),
|
||||
i.data(), f.data(), forget_bias, cell_clip, batch_size, cell_size));
|
||||
} else {
|
||||
TF_CHECK_OK(CudaLaunchKernel(
|
||||
TF_CHECK_OK(GpuLaunchKernel(
|
||||
lstm_gates<T, false>, grid_dim_2d, block_dim_2d, 0, cu_stream,
|
||||
icfo.data(), b.data(), cs_prev.data(), wci.data(), wcf.data(),
|
||||
wco.data(), o.data(), h.data(), ci.data(), cs.data(), co.data(),
|
||||
@ -376,7 +376,7 @@ void LSTMBlockCellBpropWithCUDA(
|
||||
dim3 grid_dim_2d(Eigen::divup(batch_size, static_cast<int>(block_dim_2d.x)),
|
||||
Eigen::divup(cell_size, static_cast<int>(block_dim_2d.y)));
|
||||
|
||||
TF_CHECK_OK(CudaLaunchKernel(
|
||||
TF_CHECK_OK(GpuLaunchKernel(
|
||||
lstm_gates_bprop<T>, grid_dim_2d, block_dim_2d, 0, cu_stream,
|
||||
cs_prev.data(), h_prev.data(), w.data(), wci.data(), wcf.data(),
|
||||
wco.data(), b.data(), i.data(), cs.data(), f.data(), o.data(), ci.data(),
|
||||
|
@ -155,24 +155,24 @@ TEST_F(GpuLaunchConfigTest, GetGpuLaunchConfig) {
|
||||
// test valid inputs
|
||||
#define TEST_LAUNCH_PARAMETER(work_element_count) \
|
||||
cfg = GetGpuLaunchConfig(bufsize, d); \
|
||||
TF_CHECK_OK(CudaLaunchKernel(SetOutbufZero, cfg.block_count, \
|
||||
cfg.thread_per_block, 0, d.stream(), cfg, \
|
||||
outbuf)); \
|
||||
TF_CHECK_OK(GpuLaunchKernel(SetOutbufZero, cfg.block_count, \
|
||||
cfg.thread_per_block, 0, d.stream(), cfg, \
|
||||
outbuf)); \
|
||||
CUDA_ASSERT_SUCCESS \
|
||||
cfg = GetGpuLaunchConfig(work_element_count, d); \
|
||||
TF_CHECK_OK(CudaLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \
|
||||
0, d.stream(), cfg, bufsize, outbuf)); \
|
||||
TF_CHECK_OK(GpuLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \
|
||||
0, d.stream(), cfg, bufsize, outbuf)); \
|
||||
CUDA_EXPECT_SUCCESS \
|
||||
EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0)); \
|
||||
\
|
||||
cfg = GetGpuLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \
|
||||
TF_CHECK_OK(CudaLaunchKernel(SetOutbufZero, cfg.block_count, \
|
||||
cfg.thread_per_block, 0, d.stream(), cfg, \
|
||||
outbuf)); \
|
||||
TF_CHECK_OK(GpuLaunchKernel(SetOutbufZero, cfg.block_count, \
|
||||
cfg.thread_per_block, 0, d.stream(), cfg, \
|
||||
outbuf)); \
|
||||
CUDA_ASSERT_SUCCESS \
|
||||
cfg = GetGpuLaunchConfig(work_element_count, d, Count1D, 0, 0); \
|
||||
TF_CHECK_OK(CudaLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \
|
||||
0, d.stream(), cfg, bufsize, outbuf)); \
|
||||
TF_CHECK_OK(GpuLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \
|
||||
0, d.stream(), cfg, bufsize, outbuf)); \
|
||||
CUDA_EXPECT_SUCCESS \
|
||||
EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0))
|
||||
|
||||
@ -206,29 +206,27 @@ TEST_F(GpuLaunchConfigTest, GetGpu2DLaunchConfig) {
|
||||
GpuLaunchConfig cfg1d;
|
||||
|
||||
// test valid inputs
|
||||
#define TEST_LAUNCH_PARAMETER(dimx, dimy) \
|
||||
cfg1d = GetGpuLaunchConfig(bufsize, d); \
|
||||
TF_EXPECT_OK(CudaLaunchKernel(SetOutbufZero, cfg1d.block_count, \
|
||||
cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
|
||||
outbuf)); \
|
||||
CUDA_ASSERT_SUCCESS \
|
||||
cfg = GetGpu2DLaunchConfig(dimx, dimy, d); \
|
||||
TF_EXPECT_OK(CudaLaunchKernel(Count2D, cfg.block_count, \
|
||||
cfg.thread_per_block, 0, d.stream(), cfg, \
|
||||
bufsize, outbuf)); \
|
||||
CUDA_EXPECT_SUCCESS \
|
||||
EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0)); \
|
||||
\
|
||||
cfg1d = GetGpuLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \
|
||||
TF_EXPECT_OK(CudaLaunchKernel(SetOutbufZero, cfg1d.block_count, \
|
||||
cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
|
||||
outbuf)); \
|
||||
CUDA_ASSERT_SUCCESS \
|
||||
cfg = GetGpu2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0); \
|
||||
TF_EXPECT_OK(CudaLaunchKernel(Count2D, cfg.block_count, \
|
||||
cfg.thread_per_block, 0, d.stream(), cfg, \
|
||||
bufsize, outbuf)); \
|
||||
CUDA_EXPECT_SUCCESS \
|
||||
#define TEST_LAUNCH_PARAMETER(dimx, dimy) \
|
||||
cfg1d = GetGpuLaunchConfig(bufsize, d); \
|
||||
TF_EXPECT_OK(GpuLaunchKernel(SetOutbufZero, cfg1d.block_count, \
|
||||
cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
|
||||
outbuf)); \
|
||||
CUDA_ASSERT_SUCCESS \
|
||||
cfg = GetGpu2DLaunchConfig(dimx, dimy, d); \
|
||||
TF_EXPECT_OK(GpuLaunchKernel(Count2D, cfg.block_count, cfg.thread_per_block, \
|
||||
0, d.stream(), cfg, bufsize, outbuf)); \
|
||||
CUDA_EXPECT_SUCCESS \
|
||||
EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0)); \
|
||||
\
|
||||
cfg1d = GetGpuLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \
|
||||
TF_EXPECT_OK(GpuLaunchKernel(SetOutbufZero, cfg1d.block_count, \
|
||||
cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
|
||||
outbuf)); \
|
||||
CUDA_ASSERT_SUCCESS \
|
||||
cfg = GetGpu2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0); \
|
||||
TF_EXPECT_OK(GpuLaunchKernel(Count2D, cfg.block_count, cfg.thread_per_block, \
|
||||
0, d.stream(), cfg, bufsize, outbuf)); \
|
||||
CUDA_EXPECT_SUCCESS \
|
||||
EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0))
|
||||
|
||||
TEST_LAUNCH_PARAMETER(128, 128);
|
||||
@ -250,17 +248,16 @@ TEST_F(GpuLaunchConfigTest, GetGpu3DLaunchConfig) {
|
||||
GpuLaunchConfig cfg1d;
|
||||
|
||||
// test valid inputs
|
||||
#define TEST_LAUNCH_PARAMETER(dimx, dimy, dimz) \
|
||||
cfg1d = GetGpuLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \
|
||||
TF_EXPECT_OK(CudaLaunchKernel(SetOutbufZero, cfg1d.block_count, \
|
||||
cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
|
||||
outbuf)); \
|
||||
CUDA_ASSERT_SUCCESS \
|
||||
cfg = GetGpu3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0); \
|
||||
TF_EXPECT_OK(CudaLaunchKernel(Count3D, cfg.block_count, \
|
||||
cfg.thread_per_block, 0, d.stream(), cfg, \
|
||||
bufsize, outbuf)); \
|
||||
CUDA_EXPECT_SUCCESS \
|
||||
#define TEST_LAUNCH_PARAMETER(dimx, dimy, dimz) \
|
||||
cfg1d = GetGpuLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \
|
||||
TF_EXPECT_OK(GpuLaunchKernel(SetOutbufZero, cfg1d.block_count, \
|
||||
cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
|
||||
outbuf)); \
|
||||
CUDA_ASSERT_SUCCESS \
|
||||
cfg = GetGpu3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0); \
|
||||
TF_EXPECT_OK(GpuLaunchKernel(Count3D, cfg.block_count, cfg.thread_per_block, \
|
||||
0, d.stream(), cfg, bufsize, outbuf)); \
|
||||
CUDA_EXPECT_SUCCESS \
|
||||
EXPECT_EQ(dimx* dimy* dimz, std::accumulate(outbuf, outbuf + bufsize, 0))
|
||||
|
||||
TEST_LAUNCH_PARAMETER(128, 128, 128);
|
||||
@ -282,8 +279,8 @@ TEST(CudaDeviceFunctionsTest, ShuffleGetSrcLane) {
|
||||
unsigned* failure_count;
|
||||
ASSERT_EQ(cudaMallocManaged(&failure_count, sizeof(unsigned)), cudaSuccess);
|
||||
*failure_count = 0;
|
||||
TF_EXPECT_OK(CudaLaunchKernel(CudaShuffleGetSrcLaneTest, 1, 32, 0, nullptr,
|
||||
failure_count));
|
||||
TF_EXPECT_OK(GpuLaunchKernel(CudaShuffleGetSrcLaneTest, 1, 32, 0, nullptr,
|
||||
failure_count));
|
||||
ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);
|
||||
ASSERT_EQ(*failure_count, 0);
|
||||
cudaFree(failure_count);
|
||||
|
@ -27,8 +27,8 @@ __global__ void AddOneKernel(const int* in, const int N, int* out) {
|
||||
}
|
||||
|
||||
void AddOneKernelLauncher(const int* in, const int N, int* out) {
|
||||
TF_CHECK_OK(::tensorflow::CudaLaunchKernel(AddOneKernel, 32, 256, 0, nullptr,
|
||||
in, N, out));
|
||||
TF_CHECK_OK(::tensorflow::GpuLaunchKernel(AddOneKernel, 32, 256, 0, nullptr,
|
||||
in, N, out));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -26,8 +26,8 @@ __global__ void AddOneKernel(const int* in, const int N, int* out) {
|
||||
}
|
||||
|
||||
void AddOneKernelLauncher(const int* in, const int N, int* out) {
|
||||
TF_CHECK_OK(::tensorflow::CudaLaunchKernel(AddOneKernel, 32, 256, 0, nullptr,
|
||||
in, N, out));
|
||||
TF_CHECK_OK(::tensorflow::GpuLaunchKernel(AddOneKernel, 32, 256, 0, nullptr,
|
||||
in, N, out));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user