Add ROCm support for launching 2D GPU convolutions

Author:  Wen-Heng (Jack) Chung
Date:    2019-05-30 16:01:31 +00:00
Parent:  ef0b1eff8d
Commit:  82ccb9a50d

8 changed files with 62 additions and 57 deletions
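
Most of the change below is mechanical: the CUDA-only guards widen to
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM, and the CUDA-named launch helpers are
swapped for their backend-neutral counterparts (CUDA_1D_KERNEL_LOOP ->
GPU_1D_KERNEL_LOOP, GetCudaLaunchConfig -> GetGpuLaunchConfig,
CudaLaunchKernel -> GpuLaunchKernel). As a rough illustrative sketch, not part
of this commit, of what a kernel written against the portable helpers looks
like (ScaleKernel, LaunchScale and the include path are assumed names):

// Illustrative sketch only; not taken from this commit.
#define EIGEN_USE_GPU
#include "tensorflow/core/util/gpu_kernel_helper.h"

namespace tensorflow {

template <typename T>
__global__ void ScaleKernel(int nthreads, const T* in, T alpha, T* out) {
  // Grid-stride loop over all virtual threads, as in the kernels below.
  GPU_1D_KERNEL_LOOP(index, nthreads) { out[index] = alpha * in[index]; }
}

template <typename T>
void LaunchScale(const Eigen::GpuDevice& d, const T* in, T alpha, T* out,
                 int count) {
  // Size the launch for count elements, then start the kernel on whichever
  // backend (CUDA stream or ROCm/HIP stream) the device wraps.
  GpuLaunchConfig config = GetGpuLaunchConfig(count, d);
  TF_CHECK_OK(GpuLaunchKernel(ScaleKernel<T>, config.block_count,
                              config.thread_per_block, 0, d.stream(),
                              config.virtual_thread_count, in, alpha, out));
}

}  // namespace tensorflow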

View File

@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_CONV_2D_GPU_H_
 #define TENSORFLOW_CORE_KERNELS_CONV_2D_GPU_H_
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU
@@ -25,7 +25,9 @@ limitations under the License.
 #include <limits>
 #include <utility>
+#if GOOGLE_CUDA
 #include "third_party/gpus/cuda/include/cuda.h"
+#endif
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/lib/math/math_util.h"
@@ -49,7 +51,7 @@ struct maybe_conj {
   }
 };
-// Partial specializations for Cuda types used to store complex numbers.
+// Partial specializations for Gpu types used to store complex numbers.
 template <bool conjugate>
 struct maybe_conj<float2, conjugate> {
   __device__ static __inline__ float2 run(float2 c) {
@@ -191,7 +193,7 @@ __global__ void ShuffleInTensor3Simple(int nthreads, const T* input,
   // performance. Iterating over output will generate sequential writes and
   // random reads that performs better compared to sequential reads and random
   // writes.
-  CUDA_1D_KERNEL_LOOP(output_index, nthreads) {
+  GPU_1D_KERNEL_LOOP(output_index, nthreads) {
     Index<3> output_tensor_index = FlatToTensorIndex(output_index, output_dims);
     Index<3> input_tensor_index;
@@ -232,11 +234,15 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
   // One extra line in the inner dimension to avoid share memory bank conflict.
   // This is to mimic the following, but no constructor of T can be invoked.
   // __shared__ T shared_memory_tile[TileSizeI][TileSizeJ + 1];
+#if GOOGLE_CUDA
   __shared__ __align__(
       alignof(T)) char shared_mem_raw[TileSizeI * (TileSizeJ + 1) * sizeof(T)];
   typedef T(*SharedMemoryTile)[TileSizeJ + 1];
   SharedMemoryTile shared_memory_tile =
       reinterpret_cast<SharedMemoryTile>(shared_mem_raw);
+#elif TENSORFLOW_USE_ROCM
+  __shared__ T shared_memory_tile[TileSizeI][TileSizeJ + 1];
+#endif
   int x = threadIdx.x;
@@ -357,14 +363,14 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
   }
 }
-// A Cuda custom kernel that convert input to output, given proper padding on
+// A Gpu custom kernel that convert input to output, given proper padding on
 // the left and the top. The padded value is zero.
 template <typename T, int NDIMS>
 __global__ void PadInputCustomKernelNHWC(int nthreads, const T* input,
                                          Dimension<NDIMS> input_dims, T* output,
                                          Dimension<NDIMS> output_dims,
                                          Dimension<NDIMS - 2> padding_left) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+  GPU_1D_KERNEL_LOOP(index, nthreads) {
     int output_index = index;
     Index<NDIMS> output_tensor_index =
         FlatToTensorIndex(output_index, output_dims);
@@ -393,7 +399,7 @@ __global__ void PadInputCustomKernelNCHW(int nthreads, const T* input,
                                          Dimension<NDIMS> input_dims, T* output,
                                          Dimension<NDIMS> output_dims,
                                          Dimension<NDIMS - 2> padding_left) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+  GPU_1D_KERNEL_LOOP(index, nthreads) {
     int output_index = index;
     Index<NDIMS> output_tensor_index =
         FlatToTensorIndex(output_index, output_dims);
@@ -432,19 +438,19 @@ struct TransformFilter<GPUDevice, T, int, NDIMS> {
     }
     combined_dims[1] = in.dimension(NDIMS - 2);  // input filters
     combined_dims[2] = in.dimension(NDIMS - 1);  // output filters
-    GpuLaunchConfig config = GetCudaLaunchConfig(out.size(), d);
+    GpuLaunchConfig config = GetGpuLaunchConfig(out.size(), d);
     if (dst_filter_format == FORMAT_OIHW) {
-      TF_CHECK_OK(CudaLaunchKernel(ShuffleInTensor3Simple<T, 2, 1, 0>,
+      TF_CHECK_OK(GpuLaunchKernel(ShuffleInTensor3Simple<T, 2, 1, 0>,
                                    config.block_count, config.thread_per_block,
                                    0, d.stream(), config.virtual_thread_count,
                                    in.data(), combined_dims, out.data()));
     } else if (dst_filter_format == FORMAT_OHWI) {
-      TF_CHECK_OK(CudaLaunchKernel(ShuffleInTensor3Simple<T, 1, 2, 0>,
+      TF_CHECK_OK(GpuLaunchKernel(ShuffleInTensor3Simple<T, 1, 2, 0>,
                                    config.block_count, config.thread_per_block,
                                    0, d.stream(), config.virtual_thread_count,
                                    in.data(), combined_dims, out.data()));
     } else {
       LOG(ERROR) << "Unsupported filter format: "
@@ -471,11 +477,11 @@ struct ReverseTransformFilter<GPUDevice, T, NDIMS> {
       combined_dims[2] *= in.dimension(i);
     }
-    GpuLaunchConfig config = GetCudaLaunchConfig(out.size(), d);
+    GpuLaunchConfig config = GetGpuLaunchConfig(out.size(), d);
-    TF_CHECK_OK(CudaLaunchKernel(ShuffleInTensor3Simple<T, 2, 1, 0>,
+    TF_CHECK_OK(GpuLaunchKernel(ShuffleInTensor3Simple<T, 2, 1, 0>,
                                  config.block_count, config.thread_per_block,
                                  0, d.stream(), config.virtual_thread_count,
                                  in.data(), combined_dims, out.data()));
   } else if (src_filter_format == FORMAT_OHWI) {
     combined_dims[0] = in.dimension(0);  // output filters
@@ -485,11 +491,11 @@ struct ReverseTransformFilter<GPUDevice, T, NDIMS> {
     }
     combined_dims[2] = in.dimension(NDIMS - 1);  // input filters
-    GpuLaunchConfig config = GetCudaLaunchConfig(out.size(), d);
+    GpuLaunchConfig config = GetGpuLaunchConfig(out.size(), d);
-    TF_CHECK_OK(CudaLaunchKernel(ShuffleInTensor3Simple<T, 2, 0, 1>,
+    TF_CHECK_OK(GpuLaunchKernel(ShuffleInTensor3Simple<T, 2, 0, 1>,
                                  config.block_count, config.thread_per_block,
                                  0, d.stream(), config.virtual_thread_count,
                                  in.data(), combined_dims, out.data()));
   } else {
     // TODO(ezhulenev): Set error status in OpKernelContext instead.
@@ -510,7 +516,7 @@ struct PadInput<GPUDevice, T, int, NDIMS> {
                   const std::array<int, NDIMS - 2>& padding_right,
                   typename TTypes<T, NDIMS, int>::Tensor out,
                   TensorFormat format) {
-    GpuLaunchConfig config = GetCudaLaunchConfig(out.size(), d);
+    GpuLaunchConfig config = GetGpuLaunchConfig(out.size(), d);
     Dimension<NDIMS> input_dims;
     for (int i = 0; i < NDIMS; ++i) {
       input_dims[i] = in.dimension(i);
@@ -523,12 +529,12 @@ struct PadInput<GPUDevice, T, int, NDIMS> {
     const Dimension<NDIMS - 2> padding_left_dim(padding_left);
     if (format == FORMAT_NHWC) {
-      TF_CHECK_OK(CudaLaunchKernel(
+      TF_CHECK_OK(GpuLaunchKernel(
           PadInputCustomKernelNHWC<T, NDIMS>, config.block_count,
           config.thread_per_block, 0, d.stream(), config.virtual_thread_count,
           in.data(), input_dims, out.data(), output_dims, padding_left_dim));
     } else if (format == FORMAT_NCHW) {
-      TF_CHECK_OK(CudaLaunchKernel(
+      TF_CHECK_OK(GpuLaunchKernel(
           PadInputCustomKernelNCHW<T, NDIMS>, config.block_count,
           config.thread_per_block, 0, d.stream(), config.virtual_thread_count,
          in.data(), input_dims, out.data(), output_dims, padding_left_dim));
@@ -640,13 +646,13 @@ void LaunchBatchNarrowMatrixTransposeKernel(
     const T* input, const Dimension<3>& input_dims, T* output) {
   constexpr int NumThreads = TileLongSide;
   if (tile_size_i <= TileLongSide && tile_size_j <= TileShortSide) {
-    TF_CHECK_OK(CudaLaunchKernel(
+    TF_CHECK_OK(GpuLaunchKernel(
         SwapDimension1And2InTensor3UsingTiles<T, NumThreads, TileLongSide,
                                               TileShortSide>,
         total_tiles_count, NumThreads, 0, d.stream(), input, input_dims,
         output));
   } else {
-    TF_CHECK_OK(CudaLaunchKernel(
+    TF_CHECK_OK(GpuLaunchKernel(
         SwapDimension1And2InTensor3UsingTiles<T, NumThreads, TileShortSide,
                                               TileLongSide>,
         total_tiles_count, NumThreads, 0, d.stream(), input, input_dims,
@@ -951,8 +957,7 @@ void RunSwapDimension1And2InTensor3(const GPUDevice& d, const T* input,
     int total_tiles_count = input_dims_in_tiles[0] * input_dims_in_tiles[1] *
                             input_dims_in_tiles[2];
-    TF_CHECK_OK(CudaLaunchKernel(
+    TF_CHECK_OK(GpuLaunchKernel(
         SwapDimension1And2InTensor3UsingTiles<T, kNumThreads, kTileSize,
                                               kTileSize, conjugate>,
         total_tiles_count, kNumThreads, 0, d.stream(), input, input_dims,
@@ -963,11 +968,11 @@ void RunSwapDimension1And2InTensor3(const GPUDevice& d, const T* input,
         d, input, input_dims, output, kMinDimensionToUseTiles);
   } else {
     int total_element_count = input_dims[0] * input_dims[1] * input_dims[2];
-    GpuLaunchConfig config = GetCudaLaunchConfig(total_element_count, d);
+    GpuLaunchConfig config = GetGpuLaunchConfig(total_element_count, d);
-    TF_CHECK_OK(CudaLaunchKernel(ShuffleInTensor3Simple<T, 0, 2, 1, conjugate>,
+    TF_CHECK_OK(GpuLaunchKernel(ShuffleInTensor3Simple<T, 0, 2, 1, conjugate>,
                                  config.block_count, config.thread_per_block, 0,
                                  d.stream(), config.virtual_thread_count, input,
                                  input_dims, output));
   }
 }
@@ -996,11 +1001,11 @@ struct SwapDimension0And2InTensor3<GPUDevice, T, conjugate> {
                            static_cast<int>(combined_dims[1]),
                            static_cast<int>(combined_dims[2])};
     size_t total_size = combined_dims[0] * combined_dims[1] * combined_dims[2];
-    GpuLaunchConfig config = GetCudaLaunchConfig(total_size, d);
+    GpuLaunchConfig config = GetGpuLaunchConfig(total_size, d);
-    TF_CHECK_OK(CudaLaunchKernel(ShuffleInTensor3Simple<T, 2, 1, 0, conjugate>,
+    TF_CHECK_OK(GpuLaunchKernel(ShuffleInTensor3Simple<T, 2, 1, 0, conjugate>,
                                  config.block_count, config.thread_per_block, 0,
                                  d.stream(), config.virtual_thread_count, in,
                                  input_dims, out));
   }
 };
@@ -1043,6 +1048,6 @@ struct NCHWToNHWC<GPUDevice, T, NDIMS> {
 }  // namespace functor
 }  // namespace tensorflow
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #endif  // TENSORFLOW_CORE_KERNELS_CONV_2D_GPU_H_
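
One spot in the file above is not a pure rename: the shared-memory tile in
SwapDimension1And2InTensor3UsingTiles. The CUDA path keeps the existing trick
of backing the tile with raw, suitably aligned char storage and
reinterpret_cast-ing it, so that no constructor of T runs in shared memory,
while the ROCm path declares the typed array directly. The "+ 1" on the inner
dimension pads each row so that threads walking a tile column do not all hit
the same shared-memory bank. A standalone sketch of the same idea,
illustrative only (TileKernel, TileI and TileJ are made-up names):

// Illustrative sketch only; not taken from this commit.
template <typename T, int TileI, int TileJ>
__global__ void TileKernel(const T* in, T* out) {
#if GOOGLE_CUDA
  // Raw aligned bytes reinterpreted as a 2-D tile: T's constructor never runs.
  __shared__ __align__(alignof(T)) char raw[TileI * (TileJ + 1) * sizeof(T)];
  typedef T (*Tile)[TileJ + 1];
  Tile tile = reinterpret_cast<Tile>(raw);
#elif TENSORFLOW_USE_ROCM
  // The ROCm build declares the typed tile directly.
  __shared__ T tile[TileI][TileJ + 1];
#endif
  // ... cooperatively load into tile, __syncthreads(), write transposed out ...
  (void)tile;
  (void)in;
  (void)out;
}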

View File

@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU
@@ -47,4 +47,4 @@ template struct PadInput<Eigen::GpuDevice, double, int, 5>;
 }  // namespace functor
 }  // namespace tensorflow
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU
@@ -60,4 +60,4 @@ template struct PadInput<Eigen::GpuDevice, float, int, 5>;
 }  // namespace functor
 }  // namespace tensorflow
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU
@@ -54,4 +54,4 @@ template struct PadInput<Eigen::GpuDevice, Eigen::half, int, 5>;
 }  // namespace functor
 }  // namespace tensorflow
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU
@@ -35,4 +35,4 @@ template struct SwapDimension0And2InTensor3<Eigen::GpuDevice, uint16>;
 }  // namespace functor
 }  // namespace tensorflow
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU
@@ -35,4 +35,4 @@ template struct SwapDimension0And2InTensor3<Eigen::GpuDevice, uint32>;
 }  // namespace functor
 }  // namespace tensorflow
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU
@@ -35,4 +35,4 @@ template struct SwapDimension0And2InTensor3<Eigen::GpuDevice, uint64>;
 }  // namespace functor
 }  // namespace tensorflow
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU
@@ -35,4 +35,4 @@ template struct SwapDimension0And2InTensor3<Eigen::GpuDevice, uint8>;
 }  // namespace functor
 }  // namespace tensorflow
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
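
The remaining seven files all follow the same pattern: each per-dtype
instantiation translation unit (PadInput for double, float and Eigen::half;
SwapDimension0And2InTensor3 for uint8, uint16, uint32 and uint64) now opens
and closes with the widened guard. Schematically, as a sketch with an assumed
include path rather than a copy of any one file:

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define EIGEN_USE_GPU

#include "tensorflow/core/kernels/conv_2d_gpu.h"

namespace tensorflow {
namespace functor {

// One set of explicit instantiations per dtype, e.g. for uint8:
template struct SwapDimension0And2InTensor3<Eigen::GpuDevice, uint8>;

}  // namespace functor
}  // namespace tensorflow

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM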