Merge pull request #28568 from ROCmSoftwarePlatform:google-upstream-pr-cuda-host-alias
PiperOrigin-RevId: 247976625
Commit 51b572c7cf
```diff
@@ -17,14 +17,14 @@ limitations under the License.
 #define TENSORFLOW_CORE_UTIL_GPU_CUDA_ALIAS_H_
 
 // Several forwarding macros are defined in this file to serve for backward
-// compatibility usage as we migrating from Cuda prefixed function to Gpu
-// prefixed functions. Both Cuda and ROCm can unify under the new Gpu prefix
-// naming scheme. In the migration period, we provide equivalent Cuda* and Gpu*
-// function. Over time, all Cuda* functions will be deprecated.
+// compatibility usage as we migrating from CUDA prefixed function to GPU
+// prefixed functions. Both Cuda and ROCm can unify under the new GPU prefix
+// naming scheme. In the migration period, we provide equivalent CUDA* and GPU*
+// function. Over time, all CUDA* functions will be deprecated.
 
 namespace tensorflow {
 
-// CREATE_CUDA_HOST_FUNCTION_ALIAS forward the host function to its Cuda Alias.
+// CREATE_CUDA_HOST_FUNCTION_ALIAS forward the host function to its CUDA Alias.
 #ifndef TENSORFLOW_USE_ROCM
 #define CREATE_CUDA_HOST_FUNCTION_ALIAS(func, cuda_alias) \
   template <typename... Args>                             \
```
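Only the first line of the macro body survives in this hunk. A minimal sketch of a completion consistent with that `template <typename... Args>` line, assuming a perfect-forwarding wrapper (the body below is a reconstruction, not part of the commit):

```cpp
// Reconstruction sketch: generate a Cuda*-named function template that
// forwards all arguments to the Gpu*-named function.
// Relies on std::forward from <utility>.
#define CREATE_CUDA_HOST_FUNCTION_ALIAS(func, cuda_alias) \
  template <typename... Args>                             \
  auto cuda_alias(Args&&... args)                         \
      ->decltype(func(std::forward<Args>(args)...)) {     \
    return func(std::forward<Args>(args)...);             \
  }
```

Read this way, one macro invocation replaces each of the hand-written `Cuda*` forwarding functions that the later hunks delete.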
```diff
@@ -36,7 +36,7 @@ namespace tensorflow {
 #define CREATE_CUDA_HOST_FUNCTION_ALIAS(func, cuda_alias)
 #endif
 
-// CREATE_CUDA_DEVICE_FUNCTION_ALIAS forward the device function to its Cuda
+// CREATE_CUDA_DEVICE_FUNCTION_ALIAS forward the device function to its CUDA
 // Alias.
 #ifndef TENSORFLOW_USE_ROCM
 #define CREATE_CUDA_DEVICE_FUNCTION_ALIAS(func, cuda_alias) \
```
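The device-side macro body is likewise cut off at the line continuation. Under the same assumption, it would differ from the host macro only by a `__device__` qualifier, so the generated alias is callable from kernel code:

```cpp
// Assumed analogue for __device__ functions (reconstruction, not from the diff).
#define CREATE_CUDA_DEVICE_FUNCTION_ALIAS(func, cuda_alias) \
  template <typename... Args>                               \
  __device__ auto cuda_alias(Args&&... args)                \
      ->decltype(func(std::forward<Args>(args)...)) {       \
    return func(std::forward<Args>(args)...);               \
  }
```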
```diff
@@ -49,7 +49,7 @@ namespace tensorflow {
 #define CREATE_CUDA_DEVICE_FUNCTION_ALIAS(func, cuda_alias)
 #endif
 
-// CREATE_CUDA_TYPE_ALIAS forward the type to its Cuda Alias.
+// CREATE_CUDA_TYPE_ALIAS forward the type to its CUDA Alias.
 #ifndef TENSORFLOW_USE_ROCM
 #define CREATE_CUDA_TYPE_ALIAS(type, cuda_alias) using cuda_alias = type;
 #else
```
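`CREATE_CUDA_TYPE_ALIAS` is visible in full: on the CUDA path it is a plain `using` declaration, and by analogy with the empty ROCm definitions above it presumably expands to nothing under ROCm. A hypothetical invocation:

```cpp
// Hypothetical use (the type names are for illustration):
// under CUDA this expands to:  using CudaLaunchConfig = GpuLaunchConfig;
// under ROCm it expands to nothing, leaving only the Gpu* spelling.
CREATE_CUDA_TYPE_ALIAS(GpuLaunchConfig, CudaLaunchConfig);
```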
```diff
@@ -41,16 +41,36 @@ limitations under the License.
 #define gpuSuccess cudaSuccess
 using gpuStream_t = cudaStream_t;
 using gpuError_t = cudaError_t;
 
 #elif TENSORFLOW_USE_ROCM
 #define gpuSuccess hipSuccess
 using gpuStream_t = hipStream_t;
 using gpuError_t = hipError_t;
 #endif
 
-#define GetGPUStream(context) context->eigen_gpu_device().stream()
-
 namespace tensorflow {
+#if GOOGLE_CUDA
+// cudaGetErrorString is available to both host and device
+__host__ __device__ inline const char* GpuGetErrorString(cudaError_t error) {
+  return cudaGetErrorString(error);
+#elif TENSORFLOW_USE_ROCM
+// hipGetErrorString is available on host side only
+inline const char* GpuGetErrorString(hipError_t error) {
+  return hipGetErrorString(error);
+#endif
+}
+
+inline const gpuStream_t& GetGpuStream(OpKernelContext* context) {
+  // Returns a raw reference to the current cuda stream. Required by a
+  // number of kernel calls (for which StreamInterface* does not work),
+  // i.e. CUB and certain cublas primitives.
+  const gpuStream_t* ptr = CHECK_NOTNULL(
+      reinterpret_cast<const gpuStream_t*>(context->op_device_context()
+                                               ->stream()
+                                               ->implementation()
+                                               ->GpuStreamMemberHack()));
+  return *ptr;
+}
+
 __host__ __device__ inline tensorflow::bfloat16 CudaLdg(
     const tensorflow::bfloat16* address) {
   tensorflow::bfloat16 return_value;
```
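This hunk drops the `GetGPUStream` macro in favor of a typed `GetGpuStream` accessor and adds `GpuGetErrorString`, which reads the same on both platforms. A usage sketch; the op body, the scratch-buffer handling, and the CUB call are illustrative, not part of this PR:

```cpp
// Illustrative only: combining the new helpers in an op that needs the
// raw stream (e.g. for CUB, where StreamInterface* does not work).
// Assumes d_temp/temp_bytes were sized by a prior nullptr query to CUB.
void SumOnDevice(OpKernelContext* context, const float* d_in, float* d_out,
                 int n, void* d_temp, size_t temp_bytes) {
  const gpuStream_t& stream = GetGpuStream(context);

  // CUB primitives take the raw stream directly.
  cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, n, stream);

  gpuError_t err = cudaGetLastError();  // hipGetLastError() on the ROCm path
  if (err != gpuSuccess) {
    LOG(ERROR) << "GPU reduction failed: " << GpuGetErrorString(err);
  }
}
```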
```diff
@@ -193,14 +193,7 @@ GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
   config.block_count = block_count;
   return config;
 }
-template <typename DeviceFunc>
-CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
-                                     const Eigen::GpuDevice& d, DeviceFunc func,
-                                     size_t dynamic_shared_memory_size,
-                                     int block_size_limit) {
-  return GetGpuLaunchConfig(work_element_count, d, func,
-                            dynamic_shared_memory_size, block_size_limit);
-}
+CREATE_CUDA_HOST_FUNCTION_ALIAS(GetGpuLaunchConfig, GetCudaLaunchConfig);
 
 // Calculate the GPU launch config we should use for a kernel launch. This
 // variant takes the resource limits of func into account to maximize occupancy.
```
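With the hand-written forwarder deleted, `GetCudaLaunchConfig` becomes a macro-generated alias of `GetGpuLaunchConfig`. For reference, a minimal hypothetical launch using the computed config (the kernel, the wrapper, and the two-argument overload taking only the element count and device are assumptions here):

```cpp
// Hypothetical grid-stride kernel.
__global__ void ScaleKernel(int n, float a, float* x) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    x[i] *= a;
  }
}

void LaunchScale(OpKernelContext* ctx, int n, float a, float* x) {
  const Eigen::GpuDevice& d = ctx->eigen_gpu_device();
  GpuLaunchConfig cfg = GetGpuLaunchConfig(n, d);  // assumed 2-arg overload
  ScaleKernel<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
      n, a, x);
}
```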
```diff
@@ -245,14 +238,8 @@ GpuLaunchConfig GetGpuLaunchConfigFixedBlockSize(
   config.block_count = block_count;
   return config;
 }
-template <typename DeviceFunc>
-CudaLaunchConfig GetCudaLaunchConfigFixedBlockSize(
-    int work_element_count, const Eigen::GpuDevice& d, DeviceFunc func,
-    size_t dynamic_shared_memory_size, int fixed_block_size) {
-  return GetGpuLaunchConfigFixedBlockSize(work_element_count, d, func,
-                                          dynamic_shared_memory_size,
-                                          fixed_block_size);
-}
+CREATE_CUDA_HOST_FUNCTION_ALIAS(GetGpuLaunchConfigFixedBlockSize,
+                                GetCudaLaunchConfigFixedBlockSize);
 
 struct Gpu2DLaunchConfig {
   dim3 virtual_thread_count = dim3(0, 0, 0);
```
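Applying the macro sketch from earlier, this invocation would generate roughly the following, equivalent to the deleted wrapper except that the return type is deduced rather than spelled `CudaLaunchConfig`:

```cpp
// Approximate expansion under the assumed macro body (not literal output):
template <typename... Args>
auto GetCudaLaunchConfigFixedBlockSize(Args&&... args)
    ->decltype(GetGpuLaunchConfigFixedBlockSize(std::forward<Args>(args)...)) {
  return GetGpuLaunchConfigFixedBlockSize(std::forward<Args>(args)...);
}
```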
```diff
@@ -369,15 +356,7 @@ Cuda3DLaunchConfig GetGpu3DLaunchConfig(int xdim, int ydim, int zdim,
   config.block_count = dim3(blocksx, blocksy, blocksz);
   return config;
 }
-template <typename DeviceFunc>
-Cuda3DLaunchConfig GetCuda3DLaunchConfig(int xdim, int ydim, int zdim,
-                                         const Eigen::GpuDevice& d,
-                                         DeviceFunc func,
-                                         size_t dynamic_shared_memory_size,
-                                         int block_size_limit) {
-  return GetGpu3DLaunchConfig(xdim, ydim, zdim, d, func,
-                              dynamic_shared_memory_size, block_size_limit);
-}
+CREATE_CUDA_HOST_FUNCTION_ALIAS(GetGpu3DLaunchConfig, GetCuda3DLaunchConfig);
 
 template <typename DeviceFunc>
 Gpu2DLaunchConfig GetGpu2DLaunchConfig(int xdim, int ydim,
```
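The 3D variant follows the same pattern. A hypothetical call site, using the parameter list visible in the deleted wrapper (the kernel, the zero limits, and the config field names are invented or assumed for illustration):

```cpp
// Hypothetical 3D launch; field names assumed from the launch-config
// struct pattern shown above (block_count, virtual_thread_count, ...).
__global__ void VolumeKernel(dim3 extent, float* data);

void LaunchVolume(OpKernelContext* ctx, int nx, int ny, int nz, float* data) {
  const Eigen::GpuDevice& d = ctx->eigen_gpu_device();
  Cuda3DLaunchConfig cfg = GetGpu3DLaunchConfig(
      nx, ny, nz, d, VolumeKernel, /*dynamic_shared_memory_size=*/0,
      /*block_size_limit=*/0);
  VolumeKernel<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
      cfg.virtual_thread_count, data);
}
```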
```diff
@@ -388,6 +367,7 @@ Gpu2DLaunchConfig GetGpu2DLaunchConfig(int xdim, int ydim,
   return GetGpu3DLaunchConfig(xdim, ydim, 1, d, func,
                               dynamic_shared_memory_size, block_size_limit);
 }
+CREATE_CUDA_HOST_FUNCTION_ALIAS(GetGpu2DLaunchConfig, GetCuda2DLaunchConfig);
 
 #if GOOGLE_CUDA
 // Returns a raw reference to the current cuda stream. Required by a
```
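Net effect for callers across all of these hunks: legacy `Cuda*` spellings keep compiling while new code can use the `Gpu*` names. A minimal illustration (kernel name and sizes invented):

```cpp
__global__ void Fill2D(dim3 extent, float* out);

void Demo(const Eigen::GpuDevice& d, float* out) {
  // Both calls resolve to the same implementation after this commit.
  Gpu2DLaunchConfig via_gpu =
      GetGpu2DLaunchConfig(1024, 768, d, Fill2D,
                           /*dynamic_shared_memory_size=*/0,
                           /*block_size_limit=*/0);
  Gpu2DLaunchConfig via_alias =
      GetCuda2DLaunchConfig(1024, 768, d, Fill2D, 0, 0);  // legacy spelling
  (void)via_gpu;
  (void)via_alias;
}
```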