Replacing GetCudaLaunchConfig and CudaLaunchKernel with their Gpu equivalent.
PiperOrigin-RevId: 256648520
This commit is contained in:
parent
2e00e03972
commit
450a690cfe
@ -241,9 +241,9 @@ void LSTMBlockCellFpropWithCUDA(
|
|||||||
const int block_dim = 128;
|
const int block_dim = 128;
|
||||||
const int grid_dim =
|
const int grid_dim =
|
||||||
Eigen::divup(batch_size * (cell_size + input_size), block_dim);
|
Eigen::divup(batch_size * (cell_size + input_size), block_dim);
|
||||||
TF_CHECK_OK(CudaLaunchKernel(concat_xh<T>, grid_dim, block_dim, 0, cu_stream,
|
TF_CHECK_OK(GpuLaunchKernel(concat_xh<T>, grid_dim, block_dim, 0, cu_stream,
|
||||||
xh.data(), x.data(), h_prev.data(), batch_size,
|
xh.data(), x.data(), h_prev.data(), batch_size,
|
||||||
cell_size, input_size));
|
cell_size, input_size));
|
||||||
|
|
||||||
// states1 = xh * w
|
// states1 = xh * w
|
||||||
typename TTypes<T>::ConstMatrix const_xh(xh.data(), xh.dimensions());
|
typename TTypes<T>::ConstMatrix const_xh(xh.data(), xh.dimensions());
|
||||||
@ -261,13 +261,13 @@ void LSTMBlockCellFpropWithCUDA(
|
|||||||
Eigen::divup(cell_size, static_cast<int>(block_dim_2d.y)));
|
Eigen::divup(cell_size, static_cast<int>(block_dim_2d.y)));
|
||||||
|
|
||||||
if (use_peephole) {
|
if (use_peephole) {
|
||||||
TF_CHECK_OK(CudaLaunchKernel(
|
TF_CHECK_OK(GpuLaunchKernel(
|
||||||
lstm_gates<T, true>, grid_dim_2d, block_dim_2d, 0, cu_stream,
|
lstm_gates<T, true>, grid_dim_2d, block_dim_2d, 0, cu_stream,
|
||||||
icfo.data(), b.data(), cs_prev.data(), wci.data(), wcf.data(),
|
icfo.data(), b.data(), cs_prev.data(), wci.data(), wcf.data(),
|
||||||
wco.data(), o.data(), h.data(), ci.data(), cs.data(), co.data(),
|
wco.data(), o.data(), h.data(), ci.data(), cs.data(), co.data(),
|
||||||
i.data(), f.data(), forget_bias, cell_clip, batch_size, cell_size));
|
i.data(), f.data(), forget_bias, cell_clip, batch_size, cell_size));
|
||||||
} else {
|
} else {
|
||||||
TF_CHECK_OK(CudaLaunchKernel(
|
TF_CHECK_OK(GpuLaunchKernel(
|
||||||
lstm_gates<T, false>, grid_dim_2d, block_dim_2d, 0, cu_stream,
|
lstm_gates<T, false>, grid_dim_2d, block_dim_2d, 0, cu_stream,
|
||||||
icfo.data(), b.data(), cs_prev.data(), wci.data(), wcf.data(),
|
icfo.data(), b.data(), cs_prev.data(), wci.data(), wcf.data(),
|
||||||
wco.data(), o.data(), h.data(), ci.data(), cs.data(), co.data(),
|
wco.data(), o.data(), h.data(), ci.data(), cs.data(), co.data(),
|
||||||
@ -376,7 +376,7 @@ void LSTMBlockCellBpropWithCUDA(
|
|||||||
dim3 grid_dim_2d(Eigen::divup(batch_size, static_cast<int>(block_dim_2d.x)),
|
dim3 grid_dim_2d(Eigen::divup(batch_size, static_cast<int>(block_dim_2d.x)),
|
||||||
Eigen::divup(cell_size, static_cast<int>(block_dim_2d.y)));
|
Eigen::divup(cell_size, static_cast<int>(block_dim_2d.y)));
|
||||||
|
|
||||||
TF_CHECK_OK(CudaLaunchKernel(
|
TF_CHECK_OK(GpuLaunchKernel(
|
||||||
lstm_gates_bprop<T>, grid_dim_2d, block_dim_2d, 0, cu_stream,
|
lstm_gates_bprop<T>, grid_dim_2d, block_dim_2d, 0, cu_stream,
|
||||||
cs_prev.data(), h_prev.data(), w.data(), wci.data(), wcf.data(),
|
cs_prev.data(), h_prev.data(), w.data(), wci.data(), wcf.data(),
|
||||||
wco.data(), b.data(), i.data(), cs.data(), f.data(), o.data(), ci.data(),
|
wco.data(), b.data(), i.data(), cs.data(), f.data(), o.data(), ci.data(),
|
||||||
|
@ -155,24 +155,24 @@ TEST_F(GpuLaunchConfigTest, GetGpuLaunchConfig) {
|
|||||||
// test valid inputs
|
// test valid inputs
|
||||||
#define TEST_LAUNCH_PARAMETER(work_element_count) \
|
#define TEST_LAUNCH_PARAMETER(work_element_count) \
|
||||||
cfg = GetGpuLaunchConfig(bufsize, d); \
|
cfg = GetGpuLaunchConfig(bufsize, d); \
|
||||||
TF_CHECK_OK(CudaLaunchKernel(SetOutbufZero, cfg.block_count, \
|
TF_CHECK_OK(GpuLaunchKernel(SetOutbufZero, cfg.block_count, \
|
||||||
cfg.thread_per_block, 0, d.stream(), cfg, \
|
cfg.thread_per_block, 0, d.stream(), cfg, \
|
||||||
outbuf)); \
|
outbuf)); \
|
||||||
CUDA_ASSERT_SUCCESS \
|
CUDA_ASSERT_SUCCESS \
|
||||||
cfg = GetGpuLaunchConfig(work_element_count, d); \
|
cfg = GetGpuLaunchConfig(work_element_count, d); \
|
||||||
TF_CHECK_OK(CudaLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \
|
TF_CHECK_OK(GpuLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \
|
||||||
0, d.stream(), cfg, bufsize, outbuf)); \
|
0, d.stream(), cfg, bufsize, outbuf)); \
|
||||||
CUDA_EXPECT_SUCCESS \
|
CUDA_EXPECT_SUCCESS \
|
||||||
EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0)); \
|
EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0)); \
|
||||||
\
|
\
|
||||||
cfg = GetGpuLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \
|
cfg = GetGpuLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \
|
||||||
TF_CHECK_OK(CudaLaunchKernel(SetOutbufZero, cfg.block_count, \
|
TF_CHECK_OK(GpuLaunchKernel(SetOutbufZero, cfg.block_count, \
|
||||||
cfg.thread_per_block, 0, d.stream(), cfg, \
|
cfg.thread_per_block, 0, d.stream(), cfg, \
|
||||||
outbuf)); \
|
outbuf)); \
|
||||||
CUDA_ASSERT_SUCCESS \
|
CUDA_ASSERT_SUCCESS \
|
||||||
cfg = GetGpuLaunchConfig(work_element_count, d, Count1D, 0, 0); \
|
cfg = GetGpuLaunchConfig(work_element_count, d, Count1D, 0, 0); \
|
||||||
TF_CHECK_OK(CudaLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \
|
TF_CHECK_OK(GpuLaunchKernel(Count1D, cfg.block_count, cfg.thread_per_block, \
|
||||||
0, d.stream(), cfg, bufsize, outbuf)); \
|
0, d.stream(), cfg, bufsize, outbuf)); \
|
||||||
CUDA_EXPECT_SUCCESS \
|
CUDA_EXPECT_SUCCESS \
|
||||||
EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0))
|
EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0))
|
||||||
|
|
||||||
@ -206,29 +206,27 @@ TEST_F(GpuLaunchConfigTest, GetGpu2DLaunchConfig) {
|
|||||||
GpuLaunchConfig cfg1d;
|
GpuLaunchConfig cfg1d;
|
||||||
|
|
||||||
// test valid inputs
|
// test valid inputs
|
||||||
#define TEST_LAUNCH_PARAMETER(dimx, dimy) \
|
#define TEST_LAUNCH_PARAMETER(dimx, dimy) \
|
||||||
cfg1d = GetGpuLaunchConfig(bufsize, d); \
|
cfg1d = GetGpuLaunchConfig(bufsize, d); \
|
||||||
TF_EXPECT_OK(CudaLaunchKernel(SetOutbufZero, cfg1d.block_count, \
|
TF_EXPECT_OK(GpuLaunchKernel(SetOutbufZero, cfg1d.block_count, \
|
||||||
cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
|
cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
|
||||||
outbuf)); \
|
outbuf)); \
|
||||||
CUDA_ASSERT_SUCCESS \
|
CUDA_ASSERT_SUCCESS \
|
||||||
cfg = GetGpu2DLaunchConfig(dimx, dimy, d); \
|
cfg = GetGpu2DLaunchConfig(dimx, dimy, d); \
|
||||||
TF_EXPECT_OK(CudaLaunchKernel(Count2D, cfg.block_count, \
|
TF_EXPECT_OK(GpuLaunchKernel(Count2D, cfg.block_count, cfg.thread_per_block, \
|
||||||
cfg.thread_per_block, 0, d.stream(), cfg, \
|
0, d.stream(), cfg, bufsize, outbuf)); \
|
||||||
bufsize, outbuf)); \
|
CUDA_EXPECT_SUCCESS \
|
||||||
CUDA_EXPECT_SUCCESS \
|
EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0)); \
|
||||||
EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0)); \
|
\
|
||||||
\
|
cfg1d = GetGpuLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \
|
||||||
cfg1d = GetGpuLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \
|
TF_EXPECT_OK(GpuLaunchKernel(SetOutbufZero, cfg1d.block_count, \
|
||||||
TF_EXPECT_OK(CudaLaunchKernel(SetOutbufZero, cfg1d.block_count, \
|
cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
|
||||||
cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
|
outbuf)); \
|
||||||
outbuf)); \
|
CUDA_ASSERT_SUCCESS \
|
||||||
CUDA_ASSERT_SUCCESS \
|
cfg = GetGpu2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0); \
|
||||||
cfg = GetGpu2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0); \
|
TF_EXPECT_OK(GpuLaunchKernel(Count2D, cfg.block_count, cfg.thread_per_block, \
|
||||||
TF_EXPECT_OK(CudaLaunchKernel(Count2D, cfg.block_count, \
|
0, d.stream(), cfg, bufsize, outbuf)); \
|
||||||
cfg.thread_per_block, 0, d.stream(), cfg, \
|
CUDA_EXPECT_SUCCESS \
|
||||||
bufsize, outbuf)); \
|
|
||||||
CUDA_EXPECT_SUCCESS \
|
|
||||||
EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0))
|
EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0))
|
||||||
|
|
||||||
TEST_LAUNCH_PARAMETER(128, 128);
|
TEST_LAUNCH_PARAMETER(128, 128);
|
||||||
@ -250,17 +248,16 @@ TEST_F(GpuLaunchConfigTest, GetGpu3DLaunchConfig) {
|
|||||||
GpuLaunchConfig cfg1d;
|
GpuLaunchConfig cfg1d;
|
||||||
|
|
||||||
// test valid inputs
|
// test valid inputs
|
||||||
#define TEST_LAUNCH_PARAMETER(dimx, dimy, dimz) \
|
#define TEST_LAUNCH_PARAMETER(dimx, dimy, dimz) \
|
||||||
cfg1d = GetGpuLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \
|
cfg1d = GetGpuLaunchConfig(bufsize, d, SetOutbufZero, 0, 0); \
|
||||||
TF_EXPECT_OK(CudaLaunchKernel(SetOutbufZero, cfg1d.block_count, \
|
TF_EXPECT_OK(GpuLaunchKernel(SetOutbufZero, cfg1d.block_count, \
|
||||||
cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
|
cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
|
||||||
outbuf)); \
|
outbuf)); \
|
||||||
CUDA_ASSERT_SUCCESS \
|
CUDA_ASSERT_SUCCESS \
|
||||||
cfg = GetGpu3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0); \
|
cfg = GetGpu3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0); \
|
||||||
TF_EXPECT_OK(CudaLaunchKernel(Count3D, cfg.block_count, \
|
TF_EXPECT_OK(GpuLaunchKernel(Count3D, cfg.block_count, cfg.thread_per_block, \
|
||||||
cfg.thread_per_block, 0, d.stream(), cfg, \
|
0, d.stream(), cfg, bufsize, outbuf)); \
|
||||||
bufsize, outbuf)); \
|
CUDA_EXPECT_SUCCESS \
|
||||||
CUDA_EXPECT_SUCCESS \
|
|
||||||
EXPECT_EQ(dimx* dimy* dimz, std::accumulate(outbuf, outbuf + bufsize, 0))
|
EXPECT_EQ(dimx* dimy* dimz, std::accumulate(outbuf, outbuf + bufsize, 0))
|
||||||
|
|
||||||
TEST_LAUNCH_PARAMETER(128, 128, 128);
|
TEST_LAUNCH_PARAMETER(128, 128, 128);
|
||||||
@ -282,8 +279,8 @@ TEST(CudaDeviceFunctionsTest, ShuffleGetSrcLane) {
|
|||||||
unsigned* failure_count;
|
unsigned* failure_count;
|
||||||
ASSERT_EQ(cudaMallocManaged(&failure_count, sizeof(unsigned)), cudaSuccess);
|
ASSERT_EQ(cudaMallocManaged(&failure_count, sizeof(unsigned)), cudaSuccess);
|
||||||
*failure_count = 0;
|
*failure_count = 0;
|
||||||
TF_EXPECT_OK(CudaLaunchKernel(CudaShuffleGetSrcLaneTest, 1, 32, 0, nullptr,
|
TF_EXPECT_OK(GpuLaunchKernel(CudaShuffleGetSrcLaneTest, 1, 32, 0, nullptr,
|
||||||
failure_count));
|
failure_count));
|
||||||
ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);
|
ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);
|
||||||
ASSERT_EQ(*failure_count, 0);
|
ASSERT_EQ(*failure_count, 0);
|
||||||
cudaFree(failure_count);
|
cudaFree(failure_count);
|
||||||
|
@ -27,8 +27,8 @@ __global__ void AddOneKernel(const int* in, const int N, int* out) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void AddOneKernelLauncher(const int* in, const int N, int* out) {
|
void AddOneKernelLauncher(const int* in, const int N, int* out) {
|
||||||
TF_CHECK_OK(::tensorflow::CudaLaunchKernel(AddOneKernel, 32, 256, 0, nullptr,
|
TF_CHECK_OK(::tensorflow::GpuLaunchKernel(AddOneKernel, 32, 256, 0, nullptr,
|
||||||
in, N, out));
|
in, N, out));
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -26,8 +26,8 @@ __global__ void AddOneKernel(const int* in, const int N, int* out) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void AddOneKernelLauncher(const int* in, const int N, int* out) {
|
void AddOneKernelLauncher(const int* in, const int N, int* out) {
|
||||||
TF_CHECK_OK(::tensorflow::CudaLaunchKernel(AddOneKernel, 32, 256, 0, nullptr,
|
TF_CHECK_OK(::tensorflow::GpuLaunchKernel(AddOneKernel, 32, 256, 0, nullptr,
|
||||||
in, N, out));
|
in, N, out));
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
Loading…
Reference in New Issue
Block a user