diff --git a/tensorflow/core/util/gpu_cuda_alias.h b/tensorflow/core/util/gpu_cuda_alias.h
index 5a05700d34a..0a15d15e04a 100644
--- a/tensorflow/core/util/gpu_cuda_alias.h
+++ b/tensorflow/core/util/gpu_cuda_alias.h
@@ -17,14 +17,14 @@ limitations under the License.
 #define TENSORFLOW_CORE_UTIL_GPU_CUDA_ALIAS_H_
 
 // Several forwarding macros are defined in this file to serve for backward
-// compatibility usage as we migrating from Cuda prefixed function to Gpu
-// prefixed functions. Both Cuda and ROCm can unify under the new Gpu prefix
-// naming scheme. In the migration period, we provide equivalent Cuda* and Gpu*
-// function. Over time, all Cuda* functions will be deprecated.
+// compatibility as we migrate from CUDA-prefixed functions to GPU-prefixed
+// functions, so that CUDA and ROCm can unify under the new GPU-prefixed
+// naming scheme. During the migration period, we provide equivalent CUDA* and
+// GPU* functions. Over time, all CUDA* functions will be deprecated.
 
 namespace tensorflow {
 
-// CREATE_CUDA_HOST_FUNCTION_ALIAS forward the host function to its Cuda Alias.
+// CREATE_CUDA_HOST_FUNCTION_ALIAS forwards the host function to its CUDA alias.
 #ifndef TENSORFLOW_USE_ROCM
 #define CREATE_CUDA_HOST_FUNCTION_ALIAS(func, cuda_alias) \
   template <typename... Args>                             \
@@ -36,7 +36,7 @@ namespace tensorflow {
 #define CREATE_CUDA_HOST_FUNCTION_ALIAS(func, cuda_alias)
 #endif
 
-// CREATE_CUDA_DEVICE_FUNCTION_ALIAS forward the device function to its Cuda
+// CREATE_CUDA_DEVICE_FUNCTION_ALIAS forwards the device function to its CUDA
 // Alias.
 #ifndef TENSORFLOW_USE_ROCM
 #define CREATE_CUDA_DEVICE_FUNCTION_ALIAS(func, cuda_alias) \
@@ -49,7 +49,7 @@ namespace tensorflow {
 #define CREATE_CUDA_DEVICE_FUNCTION_ALIAS(func, cuda_alias)
 #endif
 
-// CREATE_CUDA_TYPE_ALIAS forward the type to its Cuda Alias.
+// CREATE_CUDA_TYPE_ALIAS forwards the type to its CUDA alias.
 #ifndef TENSORFLOW_USE_ROCM
 #define CREATE_CUDA_TYPE_ALIAS(type, cuda_alias) using cuda_alias = type;
 #else
diff --git a/tensorflow/core/util/gpu_launch_config.h b/tensorflow/core/util/gpu_launch_config.h
index e4a20e2f377..ae351e7889b 100644
--- a/tensorflow/core/util/gpu_launch_config.h
+++ b/tensorflow/core/util/gpu_launch_config.h
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/gpu_cuda_alias.h"
 
 // Usage of GetGpuLaunchConfig, GetGpu2DLaunchConfig, and
 // GetGpu3DLaunchConfig:
@@ -192,14 +193,7 @@ GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
   config.block_count = block_count;
   return config;
 }
-template <typename DeviceFunc>
-CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
-                                     const Eigen::GpuDevice& d, DeviceFunc func,
-                                     size_t dynamic_shared_memory_size,
-                                     int block_size_limit) {
-  return GetGpuLaunchConfig(work_element_count, d, func,
-                            dynamic_shared_memory_size, block_size_limit);
-}
+CREATE_CUDA_HOST_FUNCTION_ALIAS(GetGpuLaunchConfig, GetCudaLaunchConfig);
 
 // Calculate the Cuda launch config we should use for a kernel launch. This
 // variant takes the resource limits of func into account to maximize occupancy.
@@ -244,14 +238,7 @@ GpuLaunchConfig GetGpuLaunchConfigFixedBlockSize(
   config.block_count = block_count;
   return config;
 }
-template <typename DeviceFunc>
-CudaLaunchConfig GetCudaLaunchConfigFixedBlockSize(
-    int work_element_count, const Eigen::GpuDevice& d, DeviceFunc func,
-    size_t dynamic_shared_memory_size, int fixed_block_size) {
-  return GetGpuLaunchConfigFixedBlockSize(work_element_count, d, func,
-                                          dynamic_shared_memory_size,
-                                          fixed_block_size);
-}
+CREATE_CUDA_HOST_FUNCTION_ALIAS(GetGpuLaunchConfigFixedBlockSize, GetCudaLaunchConfigFixedBlockSize);
 
 struct Gpu2DLaunchConfig {
   dim3 virtual_thread_count = dim3(0, 0, 0);
@@ -368,15 +355,7 @@ Cuda3DLaunchConfig GetGpu3DLaunchConfig(int xdim, int ydim, int zdim,
   config.block_count = dim3(blocksx, blocksy, blocksz);
   return config;
 }
-template <typename DeviceFunc>
-Cuda3DLaunchConfig GetCuda3DLaunchConfig(int xdim, int ydim, int zdim,
-                                         const Eigen::GpuDevice& d,
-                                         DeviceFunc func,
-                                         size_t dynamic_shared_memory_size,
-                                         int block_size_limit) {
-  return GetGpu3DLaunchConfig(xdim, ydim, zdim, d, func,
-                              dynamic_shared_memory_size, block_size_limit);
-}
+CREATE_CUDA_HOST_FUNCTION_ALIAS(GetGpu3DLaunchConfig, GetCuda3DLaunchConfig);
 
 template <typename DeviceFunc>
 Gpu2DLaunchConfig GetGpu2DLaunchConfig(int xdim, int ydim,
@@ -387,6 +366,7 @@ Gpu2DLaunchConfig GetGpu2DLaunchConfig(int xdim, int ydim,
   return GetGpu3DLaunchConfig(xdim, ydim, 1, d, func,
                               dynamic_shared_memory_size, block_size_limit);
 }
+CREATE_CUDA_HOST_FUNCTION_ALIAS(GetGpu2DLaunchConfig, GetCuda2DLaunchConfig);
 
 #if GOOGLE_CUDA
 // Returns a raw reference to the current cuda stream. Required by a
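
Reviewer note: the patch deletes the hand-written GetCuda* wrappers and replaces each with one macro invocation. As a sanity check that this is behavior-preserving, below is a minimal, self-contained sketch (not TensorFlow code; Widget, GetGpuWidget, and GetCudaWidget are hypothetical names) of the forwarding pattern CREATE_CUDA_HOST_FUNCTION_ALIAS expands to on the CUDA path, showing that a legacy Cuda-named call site still compiles and forwards to the Gpu-named function:

#include <utility>

// Toy stand-ins; none of these are TensorFlow symbols.
struct Widget {
  int value;
};

Widget GetGpuWidget(int x) { return Widget{x}; }

// The pattern the macro expands to (whitespace simplified): a variadic
// template that perfect-forwards every argument and deduces its return type
// from the aliased function, so one invocation replaces one wrapper.
#define CREATE_CUDA_HOST_FUNCTION_ALIAS(func, cuda_alias) \
  template <typename... Args>                             \
  auto cuda_alias(Args&&... args)                         \
      ->decltype(func(std::forward<Args>(args)...)) {     \
    return func(std::forward<Args>(args)...);             \
  }

CREATE_CUDA_HOST_FUNCTION_ALIAS(GetGpuWidget, GetCudaWidget);

int main() {
  // Legacy Cuda-named call sites keep compiling; the alias forwards the
  // argument to GetGpuWidget and returns its result unchanged.
  Widget w = GetCudaWidget(42);
  return w.value == 42 ? 0 : 1;
}

One subtlety of the macro form: the alias takes a deduced parameter pack rather than the old explicit DeviceFunc template parameter, so a call site that spelled out template arguments (e.g. GetCudaLaunchConfig<SomeKernel>(...)) would stop compiling; ordinary deduced calls behave identically.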
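For context on what the surviving GetGpuLaunchConfigFixedBlockSize (whose tail appears as context above) computes, here is a hedged, self-contained sketch of its block-count policy. Names and numbers are illustrative, and the per-SM block bound is passed in as a plain parameter, whereas the real implementation derives it from the occupancy API and the device's multiprocessor count:

#include <algorithm>
#include <cstdio>

// Toy stand-in for GpuLaunchConfig; field names mirror the TF struct.
struct LaunchConfig1D {
  int virtual_thread_count = 0;
  int thread_per_block = 0;
  int block_count = 0;
};

// Fixed-block-size policy: launch ceil(N / block_size) blocks, clamped by
// how many blocks the device can keep resident at once.
LaunchConfig1D GetLaunchConfigFixedBlockSize(int work_element_count,
                                             int fixed_block_size,
                                             int multiprocessor_count,
                                             int max_blocks_per_sm) {
  LaunchConfig1D config;
  config.virtual_thread_count = work_element_count;
  config.thread_per_block = fixed_block_size;
  const int needed_blocks =
      (work_element_count + fixed_block_size - 1) / fixed_block_size;
  config.block_count =
      std::min(needed_blocks, multiprocessor_count * max_blocks_per_sm);
  return config;
}

int main() {
  // 2^20 elements, 256-thread blocks, 80 SMs, 16 resident blocks per SM.
  const LaunchConfig1D c = GetLaunchConfigFixedBlockSize(1 << 20, 256, 80, 16);
  std::printf("blocks=%d threads_per_block=%d\n", c.block_count,
              c.thread_per_block);
  return 0;
}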