diff --git a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
index e211fa22771..e234fb87b7a 100644
--- a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
@@ -140,8 +140,8 @@ void ConcatGPUImpl(const Eigen::GpuDevice& gpu_device,
                    const GpuDeviceArrayStruct<IntType>& output_scan,
                    bool fixed_size, int split_size,
                    typename TTypes<T, 2>::Matrix* output) {
-  auto config = GetCuda2DLaunchConfig(output->dimension(1),
-                                      output->dimension(0), gpu_device);
+  auto config = GetGpu2DLaunchConfig(output->dimension(1), output->dimension(0),
+                                     gpu_device);
 
   if (fixed_size) {
     TF_CHECK_OK(CudaLaunchKernel(
diff --git a/tensorflow/core/kernels/split_lib_gpu.cu.cc b/tensorflow/core/kernels/split_lib_gpu.cu.cc
index 368239477b1..558c6439bb8 100644
--- a/tensorflow/core/kernels/split_lib_gpu.cu.cc
+++ b/tensorflow/core/kernels/split_lib_gpu.cu.cc
@@ -222,7 +222,7 @@ void SplitVOpGPULaunch<T, IntType>::Run(
         gpu_device.stream(), input_ptr, total_rows, total_cols,
         output_ptr_data));
   } else {
-    auto config = GetCuda2DLaunchConfig(total_cols, total_rows, gpu_device);
+    auto config = GetGpu2DLaunchConfig(total_cols, total_rows, gpu_device);
     IntType smem_max = gpu_device.sharedMemPerBlock();
     IntType smem_usage = output_scan.size * sizeof(IntType);
     // performance crossover is less than using maximum available shared
diff --git a/tensorflow/core/kernels/svd_op_gpu.cu.cc b/tensorflow/core/kernels/svd_op_gpu.cu.cc
index 3f51820cd55..2f7ba18dd4a 100644
--- a/tensorflow/core/kernels/svd_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/svd_op_gpu.cu.cc
@@ -59,7 +59,7 @@ namespace {
 // The result is stored in V[batch] and has the same sign as the
 // real value of V (which should be computed)
 template <class Scalar>
-__global__ void ComputeValueOfVKernel(Cuda2DLaunchConfig config, int64 m,
+__global__ void ComputeValueOfVKernel(Gpu2DLaunchConfig config, int64 m,
                                       int64 ldu, const Scalar* M,
                                       const Scalar* U, const Scalar* S,
                                       Scalar* V) {
@@ -195,7 +195,7 @@ class SvdOpGpu : public AsyncOpKernel {
       // 1. compute the (batched) sum
      const GPUDevice& d = context->eigen_device<GPUDevice>();
       d.memset(outputV_ptr, 0, batch_size * sizeof(Scalar));
-      Cuda2DLaunchConfig cfg2D = GetCuda2DLaunchConfig(batch_size, m, d);
+      Gpu2DLaunchConfig cfg2D = GetCuda2DLaunchConfig(batch_size, m, d);
       TF_CHECK_OK(CudaLaunchKernel(ComputeValueOfVKernel<Scalar>,
                                    cfg2D.block_count, cfg2D.thread_per_block, 0,
                                    d.stream(), cfg2D, m, full_matrices_ ? m : p,
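Note on the hunk above: it renames the config type but still calls GetCuda2DLaunchConfig, and the operator== hunk below compares a Gpu2DLaunchConfig against a Cuda2DLaunchConfig. Mixed spellings like these only compile if the rename keeps the old Cuda* names alive as aliases for the new Gpu* ones. A minimal sketch of what such transition shims could look like; the exact declarations are an assumption, not part of this diff:

// Assumed transition shims -- not taken from this diff. With declarations
// like these in the launch-config header, old Cuda* and new Gpu* spellings
// can be mixed freely while call sites are migrated file by file.
using Cuda2DLaunchConfig = Gpu2DLaunchConfig;
using Cuda3DLaunchConfig = Gpu3DLaunchConfig;

inline Gpu2DLaunchConfig GetCuda2DLaunchConfig(int xdim, int ydim,
                                               const Eigen::GpuDevice& d) {
  return GetGpu2DLaunchConfig(xdim, ydim, d);
}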
diff --git a/tensorflow/core/util/gpu_kernel_helper_test.cu.cc b/tensorflow/core/util/gpu_kernel_helper_test.cu.cc
index c3becb1509a..1633f9e8907 100644
--- a/tensorflow/core/util/gpu_kernel_helper_test.cu.cc
+++ b/tensorflow/core/util/gpu_kernel_helper_test.cu.cc
@@ -54,7 +54,7 @@ __global__ void Count1D(GpuLaunchConfig config, int bufsize, int* outbuf) {
     atomicAdd(&outbuf[x % bufsize], 1);
   }
 }
-__global__ void Count2D(Cuda2DLaunchConfig config, int bufsize, int* outbuf) {
+__global__ void Count2D(Gpu2DLaunchConfig config, int bufsize, int* outbuf) {
   CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {
     if (x < 0) {  // x might overflow when testing extreme case
       break;
@@ -68,7 +68,7 @@ __global__ void Count2D(Cuda2DLaunchConfig config, int bufsize, int* outbuf) {
     }
   }
 }
-__global__ void Count3D(Cuda3DLaunchConfig config, int bufsize, int* outbuf) {
+__global__ void Count3D(Gpu3DLaunchConfig config, int bufsize, int* outbuf) {
   CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {
     if (x < 0) {  // x might overflow when testing extreme case
       break;
@@ -189,7 +189,7 @@ TEST_F(GpuLaunchConfigTest, GetGpuLaunchConfig) {
 #undef TEST_LAUNCH_PARAMETER
 }
 
-bool operator==(const Cuda2DLaunchConfig& a, const Cuda2DLaunchConfig& b) {
+bool operator==(const Gpu2DLaunchConfig& a, const Cuda2DLaunchConfig& b) {
   return a.thread_per_block.x == b.thread_per_block.x &&
          a.thread_per_block.y == b.thread_per_block.y &&
          a.thread_per_block.z == b.thread_per_block.z &&
@@ -201,8 +201,8 @@ bool operator==(const Cuda2DLaunchConfig& a, const Cuda2DLaunchConfig& b) {
          a.thread_per_block.z == b.thread_per_block.z;
 }
 
-TEST_F(GpuLaunchConfigTest, GetCuda2DLaunchConfig) {
-  Cuda2DLaunchConfig cfg;
+TEST_F(GpuLaunchConfigTest, GetGpu2DLaunchConfig) {
+  Gpu2DLaunchConfig cfg;
   GpuLaunchConfig cfg1d;
 
   // test valid inputs
@@ -212,7 +212,7 @@ TEST_F(GpuLaunchConfigTest, GetCuda2DLaunchConfig) {
                                   cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
                                   outbuf));                                     \
   CUDA_ASSERT_SUCCESS                                                           \
-  cfg = GetCuda2DLaunchConfig(dimx, dimy, d);                                   \
+  cfg = GetGpu2DLaunchConfig(dimx, dimy, d);                                    \
   TF_EXPECT_OK(CudaLaunchKernel(Count2D, cfg.block_count,                       \
                                 cfg.thread_per_block, 0, d.stream(), cfg,       \
                                 bufsize, outbuf));                              \
@@ -224,7 +224,7 @@ TEST_F(GpuLaunchConfigTest, GetCuda2DLaunchConfig) {
                                   cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
                                   outbuf));                                     \
   CUDA_ASSERT_SUCCESS                                                           \
-  cfg = GetCuda2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0);                    \
+  cfg = GetGpu2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0);                     \
   TF_EXPECT_OK(CudaLaunchKernel(Count2D, cfg.block_count,                       \
                                 cfg.thread_per_block, 0, d.stream(), cfg,       \
                                 bufsize, outbuf));                              \
@@ -245,8 +245,8 @@ TEST_F(GpuLaunchConfigTest, GetCuda2DLaunchConfig) {
 #undef TEST_LAUNCH_PARAMETER
 }
 
-TEST_F(GpuLaunchConfigTest, GetCuda3DLaunchConfig) {
-  Cuda3DLaunchConfig cfg;
+TEST_F(GpuLaunchConfigTest, GetGpu3DLaunchConfig) {
+  Gpu3DLaunchConfig cfg;
   GpuLaunchConfig cfg1d;
 
   // test valid inputs
@@ -256,7 +256,7 @@ TEST_F(GpuLaunchConfigTest, GetCuda3DLaunchConfig) {
                                   cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
                                   outbuf));                                     \
   CUDA_ASSERT_SUCCESS                                                           \
-  cfg = GetCuda3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0);              \
+  cfg = GetGpu3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0);               \
   TF_EXPECT_OK(CudaLaunchKernel(Count3D, cfg.block_count,                       \
                                 cfg.thread_per_block, 0, d.stream(), cfg,       \
                                 bufsize, outbuf));                              \
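For reference, here is a minimal sketch of the launch pattern these hunks converge on after the rename. ScaleKernel and LaunchScale are hypothetical names invented for illustration; only Gpu2DLaunchConfig, GetGpu2DLaunchConfig, CUDA_AXIS_KERNEL_LOOP, TF_CHECK_OK, and CudaLaunchKernel come from the diff above:

// Hypothetical example, not part of this change. Scales a row-major
// rows x cols matrix in place, one virtual thread per element, using the
// renamed 2D launch-config helpers the same way the call sites above do.
#include "tensorflow/core/util/gpu_kernel_helper.h"

template <typename T>
__global__ void ScaleKernel(Gpu2DLaunchConfig config, int64 ld, T alpha,
                            T* data) {
  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {
    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count.y, Y) {
      data[y * ld + x] *= alpha;  // scale element (row y, column x)
    }
  }
}

template <typename T>
void LaunchScale(const Eigen::GpuDevice& d, int64 rows, int64 cols, T alpha,
                 T* data) {
  // x is the fastest-varying dimension, as in the concat/split call sites.
  Gpu2DLaunchConfig config = GetGpu2DLaunchConfig(cols, rows, d);
  TF_CHECK_OK(CudaLaunchKernel(ScaleKernel<T>, config.block_count,
                               config.thread_per_block, 0, d.stream(), config,
                               cols, alpha, data));
}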