diff --git a/tensorflow/core/util/gpu_cuda_alias.h b/tensorflow/core/util/gpu_cuda_alias.h
index 5a05700d34a..0a15d15e04a 100644
--- a/tensorflow/core/util/gpu_cuda_alias.h
+++ b/tensorflow/core/util/gpu_cuda_alias.h
@@ -17,14 +17,14 @@ limitations under the License.
 #define TENSORFLOW_CORE_UTIL_GPU_CUDA_ALIAS_H_
 
 // Several forwarding macros are defined in this file to serve for backward
-// compatibility usage as we migrating from Cuda prefixed function to Gpu
-// prefixed functions. Both Cuda and ROCm can unify under the new Gpu prefix
-// naming scheme. In the migration period, we provide equivalent Cuda* and Gpu*
-// function. Over time, all Cuda* functions will be deprecated.
+// compatibility as we migrate from CUDA-prefixed functions to GPU-prefixed
+// functions. Both CUDA and ROCm can unify under the new GPU prefix naming
+// scheme. During the migration period, we provide equivalent CUDA* and GPU*
+// functions. Over time, all CUDA* functions will be deprecated.
 
 namespace tensorflow {
 
-// CREATE_CUDA_HOST_FUNCTION_ALIAS forward the host function to its Cuda Alias.
+// CREATE_CUDA_HOST_FUNCTION_ALIAS forwards the host function to its CUDA alias.
 #ifndef TENSORFLOW_USE_ROCM
 #define CREATE_CUDA_HOST_FUNCTION_ALIAS(func, cuda_alias) \
   template <typename... Args>                             \
@@ -36,7 +36,7 @@ namespace tensorflow {
 #define CREATE_CUDA_HOST_FUNCTION_ALIAS(func, cuda_alias)
 #endif
 
-// CREATE_CUDA_DEVICE_FUNCTION_ALIAS forward the device function to its Cuda
+// CREATE_CUDA_DEVICE_FUNCTION_ALIAS forwards the device function to its CUDA
 // Alias.
 #ifndef TENSORFLOW_USE_ROCM
 #define CREATE_CUDA_DEVICE_FUNCTION_ALIAS(func, cuda_alias) \
@@ -49,7 +49,7 @@ namespace tensorflow {
 #define CREATE_CUDA_DEVICE_FUNCTION_ALIAS(func, cuda_alias)
 #endif
 
-// CREATE_CUDA_TYPE_ALIAS forward the type to its Cuda Alias.
+// CREATE_CUDA_TYPE_ALIAS forwards the type to its CUDA alias.
 #ifndef TENSORFLOW_USE_ROCM
 #define CREATE_CUDA_TYPE_ALIAS(type, cuda_alias) using cuda_alias = type;
 #else
diff --git a/tensorflow/core/util/gpu_kernel_helper.h b/tensorflow/core/util/gpu_kernel_helper.h
index 3cc8a20e504..de7a292e88d 100644
--- a/tensorflow/core/util/gpu_kernel_helper.h
+++ b/tensorflow/core/util/gpu_kernel_helper.h
@@ -41,16 +41,36 @@ limitations under the License.
 #define gpuSuccess cudaSuccess
 using gpuStream_t = cudaStream_t;
 using gpuError_t = cudaError_t;
-
 #elif TENSORFLOW_USE_ROCM
 #define gpuSuccess hipSuccess
 using gpuStream_t = hipStream_t;
 using gpuError_t = hipError_t;
 #endif
 
-#define GetGPUStream(context) context->eigen_gpu_device().stream()
-
 namespace tensorflow {
+#if GOOGLE_CUDA
+// cudaGetErrorString is available to both host and device
+__host__ __device__ inline const char* GpuGetErrorString(cudaError_t error) {
+  return cudaGetErrorString(error);
+#elif TENSORFLOW_USE_ROCM
+// hipGetErrorString is available on host side only
+inline const char* GpuGetErrorString(hipError_t error) {
+  return hipGetErrorString(error);
+#endif
+}
+
+inline const gpuStream_t& GetGpuStream(OpKernelContext* context) {
+  // Returns a raw reference to the current cuda stream. Required by a
+  // number of kernel calls (for which StreamInterface* does not work),
+  // i.e. CUB and certain cublas primitives.
+  const gpuStream_t* ptr = CHECK_NOTNULL(
+      reinterpret_cast<const gpuStream_t*>(context->op_device_context()
+                                               ->stream()
+                                               ->implementation()
+                                               ->GpuStreamMemberHack()));
+  return *ptr;
+}
+
 __host__ __device__ inline tensorflow::bfloat16 CudaLdg(
     const tensorflow::bfloat16* address) {
   tensorflow::bfloat16 return_value;
diff --git a/tensorflow/core/util/gpu_launch_config.h b/tensorflow/core/util/gpu_launch_config.h
index 2d08e2b988e..75759526e3c 100644
--- a/tensorflow/core/util/gpu_launch_config.h
+++ b/tensorflow/core/util/gpu_launch_config.h
@@ -193,14 +193,7 @@ GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
   config.block_count = block_count;
   return config;
 }
-template <typename DeviceFunc>
-CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
-                                     const Eigen::GpuDevice& d, DeviceFunc func,
-                                     size_t dynamic_shared_memory_size,
-                                     int block_size_limit) {
-  return GetGpuLaunchConfig(work_element_count, d, func,
-                            dynamic_shared_memory_size, block_size_limit);
-}
+CREATE_CUDA_HOST_FUNCTION_ALIAS(GetGpuLaunchConfig, GetCudaLaunchConfig);
 
 // Calculate the GPU launch config we should use for a kernel launch. This
 // variant takes the resource limits of func into account to maximize occupancy.
@@ -245,14 +238,8 @@ GpuLaunchConfig GetGpuLaunchConfigFixedBlockSize(
   config.block_count = block_count;
   return config;
 }
-template <typename DeviceFunc>
-CudaLaunchConfig GetCudaLaunchConfigFixedBlockSize(
-    int work_element_count, const Eigen::GpuDevice& d, DeviceFunc func,
-    size_t dynamic_shared_memory_size, int fixed_block_size) {
-  return GetGpuLaunchConfigFixedBlockSize(work_element_count, d, func,
-                                          dynamic_shared_memory_size,
-                                          fixed_block_size);
-}
+CREATE_CUDA_HOST_FUNCTION_ALIAS(GetGpuLaunchConfigFixedBlockSize,
+                                GetCudaLaunchConfigFixedBlockSize);
 
 struct Gpu2DLaunchConfig {
   dim3 virtual_thread_count = dim3(0, 0, 0);
@@ -369,15 +356,7 @@ Cuda3DLaunchConfig GetGpu3DLaunchConfig(int xdim, int ydim, int zdim,
   config.block_count = dim3(blocksx, blocksy, blocksz);
   return config;
 }
-template <typename DeviceFunc>
-Cuda3DLaunchConfig GetCuda3DLaunchConfig(int xdim, int ydim, int zdim,
-                                         const Eigen::GpuDevice& d,
-                                         DeviceFunc func,
-                                         size_t dynamic_shared_memory_size,
-                                         int block_size_limit) {
-  return GetGpu3DLaunchConfig(xdim, ydim, zdim, d, func,
-                              dynamic_shared_memory_size, block_size_limit);
-}
+CREATE_CUDA_HOST_FUNCTION_ALIAS(GetGpu3DLaunchConfig, GetCuda3DLaunchConfig);
 
 template <typename DeviceFunc>
 Gpu2DLaunchConfig GetGpu2DLaunchConfig(int xdim, int ydim,
@@ -388,6 +367,7 @@ Gpu2DLaunchConfig GetGpu2DLaunchConfig(int xdim, int ydim,
   return GetGpu3DLaunchConfig(xdim, ydim, 1, d, func,
                               dynamic_shared_memory_size, block_size_limit);
 }
+CREATE_CUDA_HOST_FUNCTION_ALIAS(GetGpu2DLaunchConfig, GetCuda2DLaunchConfig);
 
 #if GOOGLE_CUDA
 // Returns a raw reference to the current cuda stream. Required by a
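
Note (illustration, not part of the patch): the CREATE_CUDA_HOST_FUNCTION_ALIAS lines above replace hand-written GetCuda*Config wrappers with a single forwarding macro, so legacy Cuda-prefixed call sites keep compiling while the Gpu-prefixed functions become the canonical implementations. The sketch below shows how a perfect-forwarding alias macro of this kind behaves; the macro body and the GetGpuAnswer/GetCudaAnswer helpers are stand-ins chosen for illustration, not the exact definitions in gpu_cuda_alias.h.

// Minimal, self-contained sketch of the forwarding-alias idea (assumed macro
// body; the real one lives in tensorflow/core/util/gpu_cuda_alias.h).
#include <iostream>
#include <utility>

#define CREATE_CUDA_HOST_FUNCTION_ALIAS(func, cuda_alias)  \
  template <typename... Args>                              \
  auto cuda_alias(Args&&... args)                          \
      -> decltype(func(std::forward<Args>(args)...)) {     \
    return func(std::forward<Args>(args)...);              \
  }

// Stand-in for a Gpu-prefixed helper such as GetGpuLaunchConfig.
int GetGpuAnswer(int x) { return 2 * x; }

// Old Cuda-prefixed call sites keep working: GetCudaAnswer forwards its
// arguments to GetGpuAnswer and preserves the return type.
CREATE_CUDA_HOST_FUNCTION_ALIAS(GetGpuAnswer, GetCudaAnswer);

int main() {
  std::cout << GetGpuAnswer(21) << " == " << GetCudaAnswer(21) << std::endl;
  return 0;
}

Compiled with -std=c++11 or later, both calls print 42. Under TENSORFLOW_USE_ROCM the real macro expands to nothing, so only the Gpu-prefixed names exist in ROCm builds.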