diff --git a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
index e211fa22771..e234fb87b7a 100644
--- a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
@@ -140,8 +140,8 @@ void ConcatGPUImpl(const Eigen::GpuDevice& gpu_device,
                    const GpuDeviceArrayStruct<IntType>& output_scan,
                    bool fixed_size, int split_size,
                    typename TTypes<T, 2>::Matrix* output) {
-  auto config = GetCuda2DLaunchConfig(output->dimension(1),
-                                      output->dimension(0), gpu_device);
+  auto config = GetGpu2DLaunchConfig(output->dimension(1), output->dimension(0),
+                                     gpu_device);
 
   if (fixed_size) {
     TF_CHECK_OK(CudaLaunchKernel(
diff --git a/tensorflow/core/kernels/split_lib_gpu.cu.cc b/tensorflow/core/kernels/split_lib_gpu.cu.cc
index 368239477b1..558c6439bb8 100644
--- a/tensorflow/core/kernels/split_lib_gpu.cu.cc
+++ b/tensorflow/core/kernels/split_lib_gpu.cu.cc
@@ -222,7 +222,7 @@ void SplitVOpGPULaunch<T, IntType>::Run(
         gpu_device.stream(), input_ptr, total_rows, total_cols,
         output_ptr_data));
   } else {
-    auto config = GetCuda2DLaunchConfig(total_cols, total_rows, gpu_device);
+    auto config = GetGpu2DLaunchConfig(total_cols, total_rows, gpu_device);
     IntType smem_max = gpu_device.sharedMemPerBlock();
     IntType smem_usage = output_scan.size * sizeof(IntType);
     // performance crossover is less than using maximum available shared
diff --git a/tensorflow/core/kernels/svd_op_gpu.cu.cc b/tensorflow/core/kernels/svd_op_gpu.cu.cc
index 3f51820cd55..2f7ba18dd4a 100644
--- a/tensorflow/core/kernels/svd_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/svd_op_gpu.cu.cc
@@ -59,7 +59,7 @@ namespace {
 // The result is stored in V[batch] and has the same sign as the
 // real value of V (which should be computed)
 template <class Scalar>
-__global__ void ComputeValueOfVKernel(Cuda2DLaunchConfig config, int64 m,
+__global__ void ComputeValueOfVKernel(Gpu2DLaunchConfig config, int64 m,
                                       int64 ldu, const Scalar* M,
                                       const Scalar* U, const Scalar* S,
                                       Scalar* V) {
@@ -195,7 +195,7 @@ class SvdOpGpu : public AsyncOpKernel {
       // 1. compute the (batched) sum
      const GPUDevice& d = context->eigen_device<GPUDevice>();
       d.memset(outputV_ptr, 0, batch_size * sizeof(Scalar));
-      Cuda2DLaunchConfig cfg2D = GetCuda2DLaunchConfig(batch_size, m, d);
+      Gpu2DLaunchConfig cfg2D = GetCuda2DLaunchConfig(batch_size, m, d);
       TF_CHECK_OK(CudaLaunchKernel(ComputeValueOfVKernel<Scalar>,
                                    cfg2D.block_count, cfg2D.thread_per_block, 0,
                                    d.stream(), cfg2D, m, full_matrices_ ? m : p,
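Note on the hunk above: it renames the config type but still calls GetCuda2DLaunchConfig, and the operator== hunk below compares a Gpu2DLaunchConfig against a Cuda2DLaunchConfig. Mixed spellings like these only compile if the rename keeps the old Cuda* names alive as aliases for the new Gpu* ones. A minimal sketch of what such transition shims could look like; the exact declarations are an assumption, not part of this diff:

// Assumed transition shims -- not taken from this diff. With declarations
// like these in the launch-config header, old Cuda* and new Gpu* spellings
// can be mixed freely while call sites are migrated file by file.
using Cuda2DLaunchConfig = Gpu2DLaunchConfig;
using Cuda3DLaunchConfig = Gpu3DLaunchConfig;

inline Gpu2DLaunchConfig GetCuda2DLaunchConfig(int xdim, int ydim,
                                               const Eigen::GpuDevice& d) {
  return GetGpu2DLaunchConfig(xdim, ydim, d);
}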
diff --git a/tensorflow/core/util/gpu_kernel_helper_test.cu.cc b/tensorflow/core/util/gpu_kernel_helper_test.cu.cc
index c3becb1509a..1633f9e8907 100644
--- a/tensorflow/core/util/gpu_kernel_helper_test.cu.cc
+++ b/tensorflow/core/util/gpu_kernel_helper_test.cu.cc
@@ -54,7 +54,7 @@ __global__ void Count1D(GpuLaunchConfig config, int bufsize, int* outbuf) {
     atomicAdd(&outbuf[x % bufsize], 1);
   }
 }
-__global__ void Count2D(Cuda2DLaunchConfig config, int bufsize, int* outbuf) {
+__global__ void Count2D(Gpu2DLaunchConfig config, int bufsize, int* outbuf) {
   CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {
     if (x < 0) {  // x might overflow when testing extreme case
       break;
@@ -68,7 +68,7 @@ __global__ void Count2D(Cuda2DLaunchConfig config, int bufsize, int* outbuf) {
     }
   }
 }
-__global__ void Count3D(Cuda3DLaunchConfig config, int bufsize, int* outbuf) {
+__global__ void Count3D(Gpu3DLaunchConfig config, int bufsize, int* outbuf) {
   CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {
     if (x < 0) {  // x might overflow when testing extreme case
       break;
@@ -189,7 +189,7 @@ TEST_F(GpuLaunchConfigTest, GetGpuLaunchConfig) {
 #undef TEST_LAUNCH_PARAMETER
 }
 
-bool operator==(const Cuda2DLaunchConfig& a, const Cuda2DLaunchConfig& b) {
+bool operator==(const Gpu2DLaunchConfig& a, const Cuda2DLaunchConfig& b) {
   return a.thread_per_block.x == b.thread_per_block.x &&
          a.thread_per_block.y == b.thread_per_block.y &&
          a.thread_per_block.z == b.thread_per_block.z &&
@@ -201,8 +201,8 @@ bool operator==(const Cuda2DLaunchConfig& a, const Cuda2DLaunchConfig& b) {
          a.thread_per_block.z == b.thread_per_block.z;
 }
 
-TEST_F(GpuLaunchConfigTest, GetCuda2DLaunchConfig) {
-  Cuda2DLaunchConfig cfg;
+TEST_F(GpuLaunchConfigTest, GetGpu2DLaunchConfig) {
+  Gpu2DLaunchConfig cfg;
   GpuLaunchConfig cfg1d;
 
   // test valid inputs
@@ -212,7 +212,7 @@ TEST_F(GpuLaunchConfigTest, GetCuda2DLaunchConfig) {
                                   cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
                                   outbuf));                                     \
   CUDA_ASSERT_SUCCESS                                                           \
-  cfg = GetCuda2DLaunchConfig(dimx, dimy, d);                                   \
+  cfg = GetGpu2DLaunchConfig(dimx, dimy, d);                                    \
   TF_EXPECT_OK(CudaLaunchKernel(Count2D, cfg.block_count,                       \
                                 cfg.thread_per_block, 0, d.stream(), cfg,       \
                                 bufsize, outbuf));                              \
@@ -224,7 +224,7 @@ TEST_F(GpuLaunchConfigTest, GetCuda2DLaunchConfig) {
                                   cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
                                   outbuf));                                     \
   CUDA_ASSERT_SUCCESS                                                           \
-  cfg = GetCuda2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0);                    \
+  cfg = GetGpu2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0);                     \
   TF_EXPECT_OK(CudaLaunchKernel(Count2D, cfg.block_count,                       \
                                 cfg.thread_per_block, 0, d.stream(), cfg,       \
                                 bufsize, outbuf));                              \
@@ -245,8 +245,8 @@ TEST_F(GpuLaunchConfigTest, GetCuda2DLaunchConfig) {
 #undef TEST_LAUNCH_PARAMETER
 }
 
-TEST_F(GpuLaunchConfigTest, GetCuda3DLaunchConfig) {
-  Cuda3DLaunchConfig cfg;
+TEST_F(GpuLaunchConfigTest, GetGpu3DLaunchConfig) {
+  Gpu3DLaunchConfig cfg;
   GpuLaunchConfig cfg1d;
 
   // test valid inputs
@@ -256,7 +256,7 @@ TEST_F(GpuLaunchConfigTest, GetCuda3DLaunchConfig) {
                                   cfg1d.thread_per_block, 0, d.stream(), cfg1d, \
                                   outbuf));                                     \
   CUDA_ASSERT_SUCCESS                                                           \
-  cfg = GetCuda3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0);              \
+  cfg = GetGpu3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0);               \
   TF_EXPECT_OK(CudaLaunchKernel(Count3D, cfg.block_count,                       \
                                 cfg.thread_per_block, 0, d.stream(), cfg,       \
                                 bufsize, outbuf));                              \
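For reference, here is a minimal sketch of the launch pattern these hunks converge on after the rename. ScaleKernel and LaunchScale are hypothetical names invented for illustration; only Gpu2DLaunchConfig, GetGpu2DLaunchConfig, CUDA_AXIS_KERNEL_LOOP, TF_CHECK_OK, and CudaLaunchKernel come from the diff above:

// Hypothetical example, not part of this change. Scales a row-major
// rows x cols matrix in place, one virtual thread per element, using the
// renamed 2D launch-config helpers the same way the call sites above do.
#include "tensorflow/core/util/gpu_kernel_helper.h"

template <typename T>
__global__ void ScaleKernel(Gpu2DLaunchConfig config, int64 ld, T alpha,
                            T* data) {
  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {
    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count.y, Y) {
      data[y * ld + x] *= alpha;  // scale element (row y, column x)
    }
  }
}

template <typename T>
void LaunchScale(const Eigen::GpuDevice& d, int64 rows, int64 cols, T alpha,
                 T* data) {
  // x is the fastest-varying dimension, as in the concat/split call sites.
  Gpu2DLaunchConfig config = GetGpu2DLaunchConfig(cols, rows, d);
  TF_CHECK_OK(CudaLaunchKernel(ScaleKernel<T>, config.block_count,
                               config.thread_per_block, 0, d.stream(), config,
                               cols, alpha, data));
}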