diff --git a/tensorflow/core/util/gpu_cuda_alias.h b/tensorflow/core/util/gpu_cuda_alias.h
index 5a05700d34a..0a15d15e04a 100644
--- a/tensorflow/core/util/gpu_cuda_alias.h
+++ b/tensorflow/core/util/gpu_cuda_alias.h
@@ -17,14 +17,14 @@ limitations under the License.
 #define TENSORFLOW_CORE_UTIL_GPU_CUDA_ALIAS_H_
 
 // Several forwarding macros are defined in this file to serve for backward
-// compatibility usage as we migrating from Cuda prefixed function to Gpu
-// prefixed functions. Both Cuda and ROCm can unify under the new Gpu prefix
-// naming scheme. In the migration period, we provide equivalent Cuda* and Gpu*
-// function. Over time, all Cuda* functions will be deprecated.
+// compatibility as we migrate from CUDA-prefixed functions to GPU-prefixed
+// functions. Both CUDA and ROCm can unify under the new GPU prefix naming
+// scheme. During the migration period, we provide equivalent CUDA* and GPU*
+// functions. Over time, all CUDA* functions will be deprecated.
 
 namespace tensorflow {
 
-// CREATE_CUDA_HOST_FUNCTION_ALIAS forward the host function to its Cuda Alias.
+// CREATE_CUDA_HOST_FUNCTION_ALIAS forwards the host function to its CUDA alias.
 #ifndef TENSORFLOW_USE_ROCM
 #define CREATE_CUDA_HOST_FUNCTION_ALIAS(func, cuda_alias) \
   template <typename... Args>                             \
@@ -36,7 +36,7 @@ namespace tensorflow {
 #define CREATE_CUDA_HOST_FUNCTION_ALIAS(func, cuda_alias)
 #endif
 
-// CREATE_CUDA_DEVICE_FUNCTION_ALIAS forward the device function to its Cuda
+// CREATE_CUDA_DEVICE_FUNCTION_ALIAS forwards the device function to its CUDA
 // Alias.
 #ifndef TENSORFLOW_USE_ROCM
 #define CREATE_CUDA_DEVICE_FUNCTION_ALIAS(func, cuda_alias) \
@@ -49,7 +49,7 @@ namespace tensorflow {
 #define CREATE_CUDA_DEVICE_FUNCTION_ALIAS(func, cuda_alias)
 #endif
 
-// CREATE_CUDA_TYPE_ALIAS forward the type to its Cuda Alias.
+// CREATE_CUDA_TYPE_ALIAS forwards the type to its CUDA alias.
 #ifndef TENSORFLOW_USE_ROCM
 #define CREATE_CUDA_TYPE_ALIAS(type, cuda_alias) using cuda_alias = type;
 #else
diff --git a/tensorflow/core/util/gpu_kernel_helper.h b/tensorflow/core/util/gpu_kernel_helper.h
index 3cc8a20e504..de7a292e88d 100644
--- a/tensorflow/core/util/gpu_kernel_helper.h
+++ b/tensorflow/core/util/gpu_kernel_helper.h
@@ -41,16 +41,36 @@ limitations under the License.
 #define gpuSuccess cudaSuccess
 using gpuStream_t = cudaStream_t;
 using gpuError_t = cudaError_t;
-
 #elif TENSORFLOW_USE_ROCM
 #define gpuSuccess hipSuccess
 using gpuStream_t = hipStream_t;
 using gpuError_t = hipError_t;
 #endif
 
-#define GetGPUStream(context) context->eigen_gpu_device().stream()
-
 namespace tensorflow {
+#if GOOGLE_CUDA
+// cudaGetErrorString is available to both host and device
+__host__ __device__ inline const char* GpuGetErrorString(cudaError_t error) {
+  return cudaGetErrorString(error);
+#elif TENSORFLOW_USE_ROCM
+// hipGetErrorString is available on host side only
+inline const char* GpuGetErrorString(hipError_t error) {
+  return hipGetErrorString(error);
+#endif
+}
+
+inline const gpuStream_t& GetGpuStream(OpKernelContext* context) {
+  // Returns a raw reference to the current cuda stream. Required by a
+  // number of kernel calls (for which StreamInterface* does not work),
+  // i.e. CUB and certain cublas primitives.
+  const gpuStream_t* ptr = CHECK_NOTNULL(
+      reinterpret_cast<const gpuStream_t*>(context->op_device_context()
+                                               ->stream()
+                                               ->implementation()
+                                               ->GpuStreamMemberHack()));
+  return *ptr;
+}
+
 __host__ __device__ inline tensorflow::bfloat16 CudaLdg(
     const tensorflow::bfloat16* address) {
   tensorflow::bfloat16 return_value;
diff --git a/tensorflow/core/util/gpu_launch_config.h b/tensorflow/core/util/gpu_launch_config.h
index 2d08e2b988e..75759526e3c 100644
--- a/tensorflow/core/util/gpu_launch_config.h
+++ b/tensorflow/core/util/gpu_launch_config.h
@@ -193,14 +193,7 @@ GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
   config.block_count = block_count;
   return config;
 }
-template <typename DeviceFunc>
-CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
-                                     const Eigen::GpuDevice& d, DeviceFunc func,
-                                     size_t dynamic_shared_memory_size,
-                                     int block_size_limit) {
-  return GetGpuLaunchConfig(work_element_count, d, func,
-                            dynamic_shared_memory_size, block_size_limit);
-}
+CREATE_CUDA_HOST_FUNCTION_ALIAS(GetGpuLaunchConfig, GetCudaLaunchConfig);
 
 // Calculate the GPU launch config we should use for a kernel launch. This
 // variant takes the resource limits of func into account to maximize occupancy.
@@ -245,14 +238,8 @@ GpuLaunchConfig GetGpuLaunchConfigFixedBlockSize(
   config.block_count = block_count;
   return config;
 }
-template <typename DeviceFunc>
-CudaLaunchConfig GetCudaLaunchConfigFixedBlockSize(
-    int work_element_count, const Eigen::GpuDevice& d, DeviceFunc func,
-    size_t dynamic_shared_memory_size, int fixed_block_size) {
-  return GetGpuLaunchConfigFixedBlockSize(work_element_count, d, func,
-                                          dynamic_shared_memory_size,
-                                          fixed_block_size);
-}
+CREATE_CUDA_HOST_FUNCTION_ALIAS(GetGpuLaunchConfigFixedBlockSize,
+                                GetCudaLaunchConfigFixedBlockSize);
 
 struct Gpu2DLaunchConfig {
   dim3 virtual_thread_count = dim3(0, 0, 0);
@@ -369,15 +356,7 @@ Cuda3DLaunchConfig GetGpu3DLaunchConfig(int xdim, int ydim, int zdim,
   config.block_count = dim3(blocksx, blocksy, blocksz);
   return config;
 }
-template <typename DeviceFunc>
-Cuda3DLaunchConfig GetCuda3DLaunchConfig(int xdim, int ydim, int zdim,
-                                         const Eigen::GpuDevice& d,
-                                         DeviceFunc func,
-                                         size_t dynamic_shared_memory_size,
-                                         int block_size_limit) {
-  return GetGpu3DLaunchConfig(xdim, ydim, zdim, d, func,
-                              dynamic_shared_memory_size, block_size_limit);
-}
+CREATE_CUDA_HOST_FUNCTION_ALIAS(GetGpu3DLaunchConfig, GetCuda3DLaunchConfig);
 
 template <typename DeviceFunc>
 Gpu2DLaunchConfig GetGpu2DLaunchConfig(int xdim, int ydim,
@@ -388,6 +367,7 @@ Gpu2DLaunchConfig GetGpu2DLaunchConfig(int xdim, int ydim,
   return GetGpu3DLaunchConfig(xdim, ydim, 1, d, func,
                               dynamic_shared_memory_size, block_size_limit);
 }
+CREATE_CUDA_HOST_FUNCTION_ALIAS(GetGpu2DLaunchConfig, GetCuda2DLaunchConfig);
 
 #if GOOGLE_CUDA
 // Returns a raw reference to the current cuda stream. Required by a
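
Note (illustration, not part of the patch): the CREATE_CUDA_HOST_FUNCTION_ALIAS lines above replace hand-written GetCuda*Config wrappers with a single forwarding macro, so legacy Cuda-prefixed call sites keep compiling while the Gpu-prefixed functions become the canonical implementations. The sketch below shows how a perfect-forwarding alias macro of this kind behaves; the macro body and the GetGpuAnswer/GetCudaAnswer helpers are stand-ins chosen for illustration, not the exact definitions in gpu_cuda_alias.h.

// Minimal, self-contained sketch of the forwarding-alias idea (assumed macro
// body; the real one lives in tensorflow/core/util/gpu_cuda_alias.h).
#include <iostream>
#include <utility>

#define CREATE_CUDA_HOST_FUNCTION_ALIAS(func, cuda_alias)  \
  template <typename... Args>                              \
  auto cuda_alias(Args&&... args)                          \
      -> decltype(func(std::forward<Args>(args)...)) {     \
    return func(std::forward<Args>(args)...);              \
  }

// Stand-in for a Gpu-prefixed helper such as GetGpuLaunchConfig.
int GetGpuAnswer(int x) { return 2 * x; }

// Old Cuda-prefixed call sites keep working: GetCudaAnswer forwards its
// arguments to GetGpuAnswer and preserves the return type.
CREATE_CUDA_HOST_FUNCTION_ALIAS(GetGpuAnswer, GetCudaAnswer);

int main() {
  std::cout << GetGpuAnswer(21) << " == " << GetCudaAnswer(21) << std::endl;
  return 0;
}

Compiled with -std=c++11 or later, both calls print 42. Under TENSORFLOW_USE_ROCM the real macro expands to nothing, so only the Gpu-prefixed names exist in ROCm builds.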