Merge pull request #28568 from ROCmSoftwarePlatform:google-upstream-pr-cuda-host-alias
PiperOrigin-RevId: 247976625
Commit 51b572c7cf
```diff
@@ -17,14 +17,14 @@ limitations under the License.
 #define TENSORFLOW_CORE_UTIL_GPU_CUDA_ALIAS_H_
 
 // Several forwarding macros are defined in this file to serve for backward
-// compatibility usage as we migrating from Cuda prefixed function to Gpu
-// prefixed functions. Both Cuda and ROCm can unify under the new Gpu prefix
-// naming scheme. In the migration period, we provide equivalent Cuda* and Gpu*
-// function. Over time, all Cuda* functions will be deprecated.
+// compatibility usage as we migrating from CUDA prefixed function to GPU
+// prefixed functions. Both Cuda and ROCm can unify under the new GPU prefix
+// naming scheme. In the migration period, we provide equivalent CUDA* and GPU*
+// function. Over time, all CUDA* functions will be deprecated.
 
 namespace tensorflow {
 
-// CREATE_CUDA_HOST_FUNCTION_ALIAS forward the host function to its Cuda Alias.
+// CREATE_CUDA_HOST_FUNCTION_ALIAS forward the host function to its CUDA Alias.
 #ifndef TENSORFLOW_USE_ROCM
 #define CREATE_CUDA_HOST_FUNCTION_ALIAS(func, cuda_alias) \
   template <typename... Args>                             \
```
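Only the first line of the macro body survives in this hunk. A minimal sketch of a completion consistent with that `template <typename... Args>` line, assuming a perfect-forwarding wrapper (the body below is a reconstruction, not part of the commit):

```cpp
// Reconstruction sketch: generate a Cuda*-named function template that
// forwards all arguments to the Gpu*-named function.
// Relies on std::forward from <utility>.
#define CREATE_CUDA_HOST_FUNCTION_ALIAS(func, cuda_alias) \
  template <typename... Args>                             \
  auto cuda_alias(Args&&... args)                         \
      ->decltype(func(std::forward<Args>(args)...)) {     \
    return func(std::forward<Args>(args)...);             \
  }
```

Read this way, one macro invocation replaces each of the hand-written `Cuda*` forwarding functions that the later hunks delete.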
```diff
@@ -36,7 +36,7 @@ namespace tensorflow {
 #define CREATE_CUDA_HOST_FUNCTION_ALIAS(func, cuda_alias)
 #endif
 
-// CREATE_CUDA_DEVICE_FUNCTION_ALIAS forward the device function to its Cuda
+// CREATE_CUDA_DEVICE_FUNCTION_ALIAS forward the device function to its CUDA
 // Alias.
 #ifndef TENSORFLOW_USE_ROCM
 #define CREATE_CUDA_DEVICE_FUNCTION_ALIAS(func, cuda_alias) \
```
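The device-side macro body is likewise cut off at the line continuation. Under the same assumption, it would differ from the host macro only by a `__device__` qualifier, so the generated alias is callable from kernel code:

```cpp
// Assumed analogue for __device__ functions (reconstruction, not from the diff).
#define CREATE_CUDA_DEVICE_FUNCTION_ALIAS(func, cuda_alias) \
  template <typename... Args>                               \
  __device__ auto cuda_alias(Args&&... args)                \
      ->decltype(func(std::forward<Args>(args)...)) {       \
    return func(std::forward<Args>(args)...);               \
  }
```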
```diff
@@ -49,7 +49,7 @@ namespace tensorflow {
 #define CREATE_CUDA_DEVICE_FUNCTION_ALIAS(func, cuda_alias)
 #endif
 
-// CREATE_CUDA_TYPE_ALIAS forward the type to its Cuda Alias.
+// CREATE_CUDA_TYPE_ALIAS forward the type to its CUDA Alias.
 #ifndef TENSORFLOW_USE_ROCM
 #define CREATE_CUDA_TYPE_ALIAS(type, cuda_alias) using cuda_alias = type;
 #else
```
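`CREATE_CUDA_TYPE_ALIAS` is visible in full: on the CUDA path it is a plain `using` declaration, and by analogy with the empty ROCm definitions above it presumably expands to nothing under ROCm. A hypothetical invocation:

```cpp
// Hypothetical use (the type names are for illustration):
// under CUDA this expands to:  using CudaLaunchConfig = GpuLaunchConfig;
// under ROCm it expands to nothing, leaving only the Gpu* spelling.
CREATE_CUDA_TYPE_ALIAS(GpuLaunchConfig, CudaLaunchConfig);
```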
```diff
@@ -41,16 +41,36 @@ limitations under the License.
 #define gpuSuccess cudaSuccess
 using gpuStream_t = cudaStream_t;
 using gpuError_t = cudaError_t;
 
 #elif TENSORFLOW_USE_ROCM
 #define gpuSuccess hipSuccess
 using gpuStream_t = hipStream_t;
 using gpuError_t = hipError_t;
 #endif
 
-#define GetGPUStream(context) context->eigen_gpu_device().stream()
-
 namespace tensorflow {
+#if GOOGLE_CUDA
+// cudaGetErrorString is available to both host and device
+__host__ __device__ inline const char* GpuGetErrorString(cudaError_t error) {
+  return cudaGetErrorString(error);
+#elif TENSORFLOW_USE_ROCM
+// hipGetErrorString is available on host side only
+inline const char* GpuGetErrorString(hipError_t error) {
+  return hipGetErrorString(error);
+#endif
+}
+
+inline const gpuStream_t& GetGpuStream(OpKernelContext* context) {
+  // Returns a raw reference to the current cuda stream. Required by a
+  // number of kernel calls (for which StreamInterface* does not work),
+  // i.e. CUB and certain cublas primitives.
+  const gpuStream_t* ptr = CHECK_NOTNULL(
+      reinterpret_cast<const gpuStream_t*>(context->op_device_context()
+                                               ->stream()
+                                               ->implementation()
+                                               ->GpuStreamMemberHack()));
+  return *ptr;
+}
+
 __host__ __device__ inline tensorflow::bfloat16 CudaLdg(
     const tensorflow::bfloat16* address) {
   tensorflow::bfloat16 return_value;
```
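This hunk drops the `GetGPUStream` macro in favor of a typed `GetGpuStream` accessor and adds `GpuGetErrorString`, which reads the same on both platforms. A usage sketch; the op body, the scratch-buffer handling, and the CUB call are illustrative, not part of this PR:

```cpp
// Illustrative only: combining the new helpers in an op that needs the
// raw stream (e.g. for CUB, where StreamInterface* does not work).
// Assumes d_temp/temp_bytes were sized by a prior nullptr query to CUB.
void SumOnDevice(OpKernelContext* context, const float* d_in, float* d_out,
                 int n, void* d_temp, size_t temp_bytes) {
  const gpuStream_t& stream = GetGpuStream(context);

  // CUB primitives take the raw stream directly.
  cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, n, stream);

  gpuError_t err = cudaGetLastError();  // hipGetLastError() on the ROCm path
  if (err != gpuSuccess) {
    LOG(ERROR) << "GPU reduction failed: " << GpuGetErrorString(err);
  }
}
```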
```diff
@@ -193,14 +193,7 @@ GpuLaunchConfig GetGpuLaunchConfig(int work_element_count,
   config.block_count = block_count;
   return config;
 }
-template <typename DeviceFunc>
-CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
-                                     const Eigen::GpuDevice& d, DeviceFunc func,
-                                     size_t dynamic_shared_memory_size,
-                                     int block_size_limit) {
-  return GetGpuLaunchConfig(work_element_count, d, func,
-                            dynamic_shared_memory_size, block_size_limit);
-}
+CREATE_CUDA_HOST_FUNCTION_ALIAS(GetGpuLaunchConfig, GetCudaLaunchConfig);
 
 // Calculate the GPU launch config we should use for a kernel launch. This
 // variant takes the resource limits of func into account to maximize occupancy.
```
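With the hand-written forwarder deleted, `GetCudaLaunchConfig` becomes a macro-generated alias of `GetGpuLaunchConfig`. For reference, a minimal hypothetical launch using the computed config (the kernel, the wrapper, and the two-argument overload taking only the element count and device are assumptions here):

```cpp
// Hypothetical grid-stride kernel.
__global__ void ScaleKernel(int n, float a, float* x) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    x[i] *= a;
  }
}

void LaunchScale(OpKernelContext* ctx, int n, float a, float* x) {
  const Eigen::GpuDevice& d = ctx->eigen_gpu_device();
  GpuLaunchConfig cfg = GetGpuLaunchConfig(n, d);  // assumed 2-arg overload
  ScaleKernel<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
      n, a, x);
}
```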
```diff
@@ -245,14 +238,8 @@ GpuLaunchConfig GetGpuLaunchConfigFixedBlockSize(
   config.block_count = block_count;
   return config;
 }
-template <typename DeviceFunc>
-CudaLaunchConfig GetCudaLaunchConfigFixedBlockSize(
-    int work_element_count, const Eigen::GpuDevice& d, DeviceFunc func,
-    size_t dynamic_shared_memory_size, int fixed_block_size) {
-  return GetGpuLaunchConfigFixedBlockSize(work_element_count, d, func,
-                                          dynamic_shared_memory_size,
-                                          fixed_block_size);
-}
+CREATE_CUDA_HOST_FUNCTION_ALIAS(GetGpuLaunchConfigFixedBlockSize,
+                                GetCudaLaunchConfigFixedBlockSize);
 
 struct Gpu2DLaunchConfig {
   dim3 virtual_thread_count = dim3(0, 0, 0);
```
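Applying the macro sketch from earlier, this invocation would generate roughly the following, equivalent to the deleted wrapper except that the return type is deduced rather than spelled `CudaLaunchConfig`:

```cpp
// Approximate expansion under the assumed macro body (not literal output):
template <typename... Args>
auto GetCudaLaunchConfigFixedBlockSize(Args&&... args)
    ->decltype(GetGpuLaunchConfigFixedBlockSize(std::forward<Args>(args)...)) {
  return GetGpuLaunchConfigFixedBlockSize(std::forward<Args>(args)...);
}
```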
```diff
@@ -369,15 +356,7 @@ Cuda3DLaunchConfig GetGpu3DLaunchConfig(int xdim, int ydim, int zdim,
   config.block_count = dim3(blocksx, blocksy, blocksz);
   return config;
 }
-template <typename DeviceFunc>
-Cuda3DLaunchConfig GetCuda3DLaunchConfig(int xdim, int ydim, int zdim,
-                                         const Eigen::GpuDevice& d,
-                                         DeviceFunc func,
-                                         size_t dynamic_shared_memory_size,
-                                         int block_size_limit) {
-  return GetGpu3DLaunchConfig(xdim, ydim, zdim, d, func,
-                              dynamic_shared_memory_size, block_size_limit);
-}
+CREATE_CUDA_HOST_FUNCTION_ALIAS(GetGpu3DLaunchConfig, GetCuda3DLaunchConfig);
 
 template <typename DeviceFunc>
 Gpu2DLaunchConfig GetGpu2DLaunchConfig(int xdim, int ydim,
```
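The 3D variant follows the same pattern. A hypothetical call site, using the parameter list visible in the deleted wrapper (the kernel, the zero limits, and the config field names are invented or assumed for illustration):

```cpp
// Hypothetical 3D launch; field names assumed from the launch-config
// struct pattern shown above (block_count, virtual_thread_count, ...).
__global__ void VolumeKernel(dim3 extent, float* data);

void LaunchVolume(OpKernelContext* ctx, int nx, int ny, int nz, float* data) {
  const Eigen::GpuDevice& d = ctx->eigen_gpu_device();
  Cuda3DLaunchConfig cfg = GetGpu3DLaunchConfig(
      nx, ny, nz, d, VolumeKernel, /*dynamic_shared_memory_size=*/0,
      /*block_size_limit=*/0);
  VolumeKernel<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
      cfg.virtual_thread_count, data);
}
```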
```diff
@@ -388,6 +367,7 @@ Gpu2DLaunchConfig GetGpu2DLaunchConfig(int xdim, int ydim,
   return GetGpu3DLaunchConfig(xdim, ydim, 1, d, func,
                               dynamic_shared_memory_size, block_size_limit);
 }
+CREATE_CUDA_HOST_FUNCTION_ALIAS(GetGpu2DLaunchConfig, GetCuda2DLaunchConfig);
 
 #if GOOGLE_CUDA
 // Returns a raw reference to the current cuda stream. Required by a
```
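Net effect for callers across all of these hunks: legacy `Cuda*` spellings keep compiling while new code can use the `Gpu*` names. A minimal illustration (kernel name and sizes invented):

```cpp
__global__ void Fill2D(dim3 extent, float* out);

void Demo(const Eigen::GpuDevice& d, float* out) {
  // Both calls resolve to the same implementation after this commit.
  Gpu2DLaunchConfig via_gpu =
      GetGpu2DLaunchConfig(1024, 768, d, Fill2D,
                           /*dynamic_shared_memory_size=*/0,
                           /*block_size_limit=*/0);
  Gpu2DLaunchConfig via_alias =
      GetCuda2DLaunchConfig(1024, 768, d, Fill2D, 0, 0);  // legacy spelling
  (void)via_gpu;
  (void)via_alias;
}
```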