Add ROCm support for pooling operators

Max pooling for qint8 is disabled on ROCm for now (the NCHW_VECT_C kernels rely on the CUDA-only __vmaxs4 path).
Wen-Heng (Jack) Chung 2019-06-28 17:10:51 +00:00
parent f0ae8bef6a
commit 4f9e60968e
11 changed files with 84 additions and 68 deletions
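
The change follows a single pattern across all eleven files: preprocessor guards that were CUDA-only (#if GOOGLE_CUDA / #ifdef GOOGLE_CUDA) are widened to #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM, and CUDA-prefixed helpers (CUDA_1D_KERNEL_LOOP, CudaLaunchKernel, GetCudaLaunchConfig, CudaAtomicAdd) switch to their vendor-neutral Gpu-prefixed counterparts. The one exception is the qint8 NCHW_VECT_C max-pooling path, which stays behind a plain #if GOOGLE_CUDA because it depends on the __vmaxs4 intrinsic. The sketch below only illustrates that pattern; the macro and kernel are simplified stand-ins, not TensorFlow's actual gpu_kernel_helper definitions.

// Minimal sketch of the portability pattern (stand-in names, not code from this commit).
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#if GOOGLE_CUDA
#include <cuda_runtime.h>      // compiled with nvcc
#elif TENSORFLOW_USE_ROCM
#include <hip/hip_runtime.h>   // compiled with hipcc
#endif

// Grid-stride loop usable from either toolchain; stands in for GPU_1D_KERNEL_LOOP.
#define GPU_1D_KERNEL_LOOP(i, n)                                \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);  \
       i += blockDim.x * gridDim.x)

// Toy kernel, present only to show the loop macro in context.
template <typename T>
__global__ void ScaleKernel(const int nthreads, const T alpha, T* data) {
  GPU_1D_KERNEL_LOOP(index, nthreads) { data[index] *= alpha; }
}

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

With guards and names made vendor-neutral, the launch sites in the hunks below need only the mechanical renames (CudaLaunchKernel -> GpuLaunchKernel, GetCudaLaunchConfig -> GetGpuLaunchConfig, CudaAtomicAdd -> GpuAtomicAdd); the kernel bodies themselves are unchanged.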

@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
@@ -48,7 +48,7 @@ __global__ void AvePoolBackwardNHWC(const int nthreads,
const int kernel_w, const int stride_h,
const int stride_w, const int pad_t,
const int pad_l, dtype* const bottom_diff) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
// find out the local index
// find out the local offset
const int c = index % channels;
@@ -90,8 +90,8 @@ bool RunAvePoolBackwardNHWC(const T* const top_diff, const int num,
const int pad_l, T* const bottom_diff,
const GPUDevice& d) {
int x_size = num * height * width * channels;
-GpuLaunchConfig config = GetCudaLaunchConfig(x_size, d);
-TF_CHECK_OK(CudaLaunchKernel(
+GpuLaunchConfig config = GetGpuLaunchConfig(x_size, d);
+TF_CHECK_OK(GpuLaunchKernel(
AvePoolBackwardNHWC<T>, config.block_count, config.thread_per_block, 0,
d.stream(), config.virtual_thread_count, top_diff, num, height, width,
channels, pooled_height, pooled_width, kernel_h, kernel_w, stride_h,
@@ -121,4 +121,4 @@ template bool RunAvePoolBackwardNHWC(
} // end namespace tensorflow
-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

@@ -608,6 +608,7 @@ typedef AutoTuneSingleton<ConvAutoTuneGroup, ConvParameters,
se::dnn::AlgorithmConfig>
AutoTuneConv;
+#if GOOGLE_CUDA
// Check the passed allocator for redzone violations.
// If violations have occurred, mark the corresponding autotune result
// as a failure.
@@ -646,6 +647,7 @@ static void CheckRedzones(const se::cuda::RedzoneAllocator& rz_allocator,
LOG(ERROR) << rz_check_status.RedzoneFailureMsg();
}
}
+#endif
template <typename T>
void LaunchConv2DOp<GPUDevice, T>::operator()(

@@ -28,7 +28,7 @@ typedef Eigen::GpuDevice GPUDevice;
namespace tensorflow {
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <typename T>
void DnnPooling3dOp<T>::Compute(OpKernelContext* context,
@@ -103,7 +103,7 @@ void DnnPooling3dOp<T>::Compute(OpKernelContext* context,
output_desc, &output_data)
.ok();
OP_REQUIRES(context, status,
errors::Internal("cudnn PoolForward launch failed"));
errors::Internal("dnn PoolForward launch failed"));
if (data_format == FORMAT_NHWC) {
auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
@@ -232,7 +232,7 @@ void DnnPooling3dGradOp<T>::Compute(
output_backprop_data, &input_backprop_data)
.ok();
OP_REQUIRES(context, status,
errors::Internal("cudnn PoolBackward launch failed"));
errors::Internal("dnn PoolBackward launch failed"));
if (data_format == FORMAT_NHWC) {
auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
@@ -249,6 +249,6 @@ void DnnPooling3dGradOp<T>::Compute(
TF_CALL_float(DEFINE_DNN_OPS) TF_CALL_half(DEFINE_DNN_OPS)
#undef DEFINE_DNN_OPS
-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
} // namespace tensorflow

@@ -22,7 +22,7 @@ limitations under the License.
#include "tensorflow/core/framework/op_kernel.h"
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/platform/stream_executor.h"
#endif
@@ -30,7 +30,7 @@ limitations under the License.
namespace tensorflow {
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
// Runs (avg/max)pooling on GPU.
// Dimension order for all array arguments is: x, y, z.

@@ -42,10 +42,12 @@ limitations under the License.
#if GOOGLE_CUDA
#include "third_party/gpus/cudnn/cudnn.h"
#endif // GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/maxpooling_op_gpu.h"
#include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
#include "tensorflow/core/platform/stream_executor.h"
-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
namespace tensorflow {
@@ -317,7 +319,7 @@ class MaxPoolingGradOp : public OpKernel {
TensorFormat data_format_;
};
-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <typename T>
static void MaxPoolingBackwardCustomKernel(
@@ -438,7 +440,7 @@ class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
bool propagate_nans_;
};
-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
// The operation to compute gradient of MaxPool gradients.
// It takes three inputs:
@@ -647,7 +649,7 @@ class MaxPoolingGradGradOp : public OpKernel {
TensorFormat data_format_;
};
-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <class T>
class MaxPoolingGradGradOp<Eigen::GpuDevice, T> : public OpKernel {
@@ -744,7 +746,7 @@ class MaxPoolingGradGradOp<Eigen::GpuDevice, T> : public OpKernel {
bool use_dnn_;
};
-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <typename Device, typename T>
struct LaunchMaxPoolingNoMask;
@@ -1112,7 +1114,7 @@ class MaxPoolingGradGradWithArgmaxOp : public OpKernel {
bool include_batch_in_index_;
};
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <typename T>
class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
public:
@@ -1383,7 +1385,7 @@ struct LaunchMaxPoolingGradGradWithArgmax<Eigen::GpuDevice, T> {
}
};
-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define REGISTER_MAX_POOL_KERNELS(D, T) \
REGISTER_KERNEL_BUILDER( \
@@ -1430,7 +1432,7 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_ONLY_POOL_KERNELS);
TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_MAX_POOL_KERNELS);
#undef REGISTER_CPU_KERNELS
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
// Forward declarations for the functor specializations for GPU.
namespace functor {
@@ -1509,7 +1511,7 @@ REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")
#undef REGISTER_GPU_ONLY_POOL_KERNELS
-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#undef REGISTER_MAX_POOL_KERNELS

@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
@@ -70,7 +70,7 @@ __global__ void MaxPoolForwardNCHW(
const int pooled_width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_t, const int pad_l,
dtype* top_data, int64* mask, const bool include_batch_in_index) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
@@ -101,6 +101,7 @@ __global__ void MaxPoolForwardNCHW(
}
}
+#if GOOGLE_CUDA
// The parameters for MaxPoolForwardNoMaskKernel_NCHW_VECT_C are the same as for
// MaxPoolForwardNCHW above, except that mask is not supported, and each
// element of the input and output contains 4 adjacent channel values for
@@ -114,7 +115,7 @@ __global__ void MaxPoolForwardNoMaskKernel_NCHW_VECT_C(
int32* top_data) {
// TODO(pauldonnelly): Implement a better optimized version of this kernel.
const int32 kMinINT8X4 = 0x80808080;
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
@@ -136,6 +137,7 @@ __global__ void MaxPoolForwardNoMaskKernel_NCHW_VECT_C(
top_data[index] = maxval;
}
}
+#endif // GOOGLE_CUDA
template <bool propagate_nans, typename dtype>
__global__ void MaxPoolForwardNHWC(
@@ -144,7 +146,7 @@ __global__ void MaxPoolForwardNHWC(
const int pooled_width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_t, const int pad_l,
dtype* top_data, int64* mask, const bool include_batch_in_index) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
int n = index;
int c = n % channels;
n /= channels;
@@ -183,7 +185,7 @@ __global__ void MaxPoolBackwardNoMaskNHWC(
const int pooled_width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_t, const int pad_l,
const dtype* top_diff, dtype* bottom_diff) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
// First find out the index to the maximum, since we have no mask.
int n = index;
int c = n % channels;
@@ -212,7 +214,7 @@ __global__ void MaxPoolBackwardNoMaskNHWC(
// Atomically accumulate the bottom diff. The index could still be
// uninitialized, if all the bottom_data are NaN.
if (maxidx != -1) {
-CudaAtomicAdd(bottom_diff + n * height * width * channels + maxidx,
+GpuAtomicAdd(bottom_diff + n * height * width * channels + maxidx,
top_diff[index]);
}
}
@@ -234,7 +236,7 @@ __global__ void MaxPoolBackwardNoMaskNHWC(
// bottom_diff: the gradient with respect to the input.
// include_batch_in_index: whether to include batch dimension in flattened
// index of `argmax`.
-// This function relies on CudaAtomicAdd to avoid race conditions. Also, before
+// This function relies on GpuAtomicAdd to avoid race conditions. Also, before
// the kernel is run, you will need to make sure that bottom_diff is filled with
// zero first.
template <typename dtype>
@@ -242,10 +244,10 @@ __global__ void MaxPoolBackward(const int nthreads, const dtype* top_diff,
const int64* mask, const int top_offset,
const int bottom_offset, dtype* bottom_diff,
const bool include_batch_in_index) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
const int offset =
include_batch_in_index ? 0 : (index / top_offset) * bottom_offset;
-CudaAtomicAdd(bottom_diff + offset + mask[index], top_diff[index]);
+GpuAtomicAdd(bottom_diff + offset + mask[index], top_diff[index]);
}
}
@@ -270,7 +272,7 @@ __global__ void MaxPoolGradBackwardNoMaskNCHW(
const int height, const int width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_t, const int pad_l,
const dtype* top_diff, dtype* bottom_diff) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
// First find out the index to the maximum, since we have no mask.
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
@@ -310,7 +312,7 @@ __global__ void MaxPoolGradBackwardNoMaskNHWC(
const int height, const int width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_t, const int pad_l,
const dtype* top_diff, dtype* bottom_diff) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
// First find out the index to the maximum, since we have no mask.
int n = index;
int c = n % channels;
@@ -369,18 +371,19 @@ __global__ void MaxPoolGradBackward(const int nthreads, const dtype* top_diff,
const int64* mask, const int top_offset,
const int bottom_offset, dtype* bottom_diff,
const bool include_batch_in_index) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
const int offset =
include_batch_in_index ? 0 : (index / bottom_offset) * top_offset;
bottom_diff[index] = top_diff[offset + mask[index]];
}
}
-#undef CUDA_1D_KERNEL_LOOP
+#undef GPU_1D_KERNEL_LOOP
} // namespace
namespace functor {
+#if GOOGLE_CUDA
// Note: channels is the outer channels (dim 1) which has already been
// divided by 4.
bool MaxPoolForwardNoMask_NCHW_VECT_C::operator()(
@@ -392,7 +395,7 @@ bool MaxPoolForwardNoMask_NCHW_VECT_C::operator()(
const int kThreadsPerBlock = 1024;
const int output_size = batch * channels * pooled_height * pooled_width;
if (output_size == 0) return true;
-TF_CHECK_OK(CudaLaunchKernel(
+TF_CHECK_OK(GpuLaunchKernel(
MaxPoolForwardNoMaskKernel_NCHW_VECT_C,
(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock,
0, d.stream(), output_size, bottom_data, height, width, channels,
@@ -400,6 +403,7 @@ bool MaxPoolForwardNoMask_NCHW_VECT_C::operator()(
pad_t, pad_l, top_data));
return d.ok();
}
+#endif // GOOGLE_CUDA
template <typename T>
bool MaxPoolForwardWithOptionalArgmax<T>::operator()(
@@ -413,7 +417,7 @@ bool MaxPoolForwardWithOptionalArgmax<T>::operator()(
const int output_size = batch * channels * pooled_height * pooled_width;
if (output_size == 0) return true;
if (propagate_nans) {
-TF_CHECK_OK(CudaLaunchKernel(
+TF_CHECK_OK(GpuLaunchKernel(
MaxPoolForwardNHWC<true, T>,
(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
kThreadsPerBlock, 0, d.stream(), output_size, bottom_data, height,
@@ -421,7 +425,7 @@ bool MaxPoolForwardWithOptionalArgmax<T>::operator()(
stride_h, stride_w, pad_t, pad_l, top_data, mask,
include_batch_in_index));
} else {
-TF_CHECK_OK(CudaLaunchKernel(
+TF_CHECK_OK(GpuLaunchKernel(
MaxPoolForwardNHWC<false, T>,
(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
kThreadsPerBlock, 0, d.stream(), output_size, bottom_data, height,
@@ -443,12 +447,12 @@ bool MaxPoolBackwardNoMask<T>::operator()(
const int bottom_size = batch * channels * height * width;
if (bottom_size == 0) return true;
-TF_CHECK_OK(CudaLaunchKernel(
+TF_CHECK_OK(GpuLaunchKernel(
SetZero<T>, (bottom_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
kThreadsPerBlock, 0, d.stream(), bottom_size, bottom_diff));
const int top_size = batch * channels * pooled_height * pooled_width;
-TF_CHECK_OK(CudaLaunchKernel(
+TF_CHECK_OK(GpuLaunchKernel(
MaxPoolBackwardNoMaskNHWC<T>,
(top_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0,
d.stream(), top_size, bottom_data, height, width, channels, pooled_height,
@@ -465,10 +469,10 @@ bool MaxPoolBackwardWithArgmax<T>::operator()(
const bool include_batch_in_index) {
const int kThreadsPerBlock = 1024;
if (input_size == 0) return true;
-TF_CHECK_OK(CudaLaunchKernel(
+TF_CHECK_OK(GpuLaunchKernel(
SetZero<T>, (input_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
kThreadsPerBlock, 0, d.stream(), input_size, bottom_diff));
-TF_CHECK_OK(CudaLaunchKernel(
+TF_CHECK_OK(GpuLaunchKernel(
MaxPoolBackward<T>,
(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock,
0, d.stream(), output_size, top_diff, mask, top_offset, bottom_offset,
@@ -486,18 +490,18 @@ bool MaxPoolGradBackwardNoMask<T>::operator()(
const Eigen::GpuDevice& d) {
const int num_kernels = batch * channels * pooled_height * pooled_width;
if (num_kernels == 0) return true;
-GpuLaunchConfig config = GetCudaLaunchConfig(num_kernels, d);
+GpuLaunchConfig config = GetGpuLaunchConfig(num_kernels, d);
if (data_format == FORMAT_NHWC) {
TF_CHECK_OK(
-CudaLaunchKernel(MaxPoolGradBackwardNoMaskNHWC<T>, config.block_count,
+GpuLaunchKernel(MaxPoolGradBackwardNoMaskNHWC<T>, config.block_count,
config.thread_per_block, 0, d.stream(), num_kernels,
bottom_data, output_data, pooled_height, pooled_width,
channels, height, width, kernel_h, kernel_w, stride_h,
stride_w, pad_t, pad_l, top_diff, bottom_diff));
} else {
TF_CHECK_OK(
-CudaLaunchKernel(MaxPoolGradBackwardNoMaskNCHW<T>, config.block_count,
+GpuLaunchKernel(MaxPoolGradBackwardNoMaskNCHW<T>, config.block_count,
config.thread_per_block, 0, d.stream(), num_kernels,
bottom_data, output_data, pooled_height, pooled_width,
channels, height, width, kernel_h, kernel_w, stride_h,
@@ -513,8 +517,8 @@ bool MaxPoolGradBackwardWithArgmax<T>::operator()(
T* bottom_diff, const Eigen::GpuDevice& d,
const bool include_batch_in_index) {
if (input_size == 0) return true;
-GpuLaunchConfig config = GetCudaLaunchConfig(output_size, d);
-TF_CHECK_OK(CudaLaunchKernel(
+GpuLaunchConfig config = GetGpuLaunchConfig(output_size, d);
+TF_CHECK_OK(GpuLaunchKernel(
MaxPoolGradBackward<T>, config.block_count, config.thread_per_block, 0,
d.stream(), output_size, top_diff, mask, top_offset, bottom_offset,
bottom_diff, include_batch_in_index));
@@ -539,4 +543,4 @@ TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
} // end namespace tensorflow
-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

@@ -33,7 +33,7 @@ limitations under the License.
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/work_sharder.h"
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/cudnn_pooling_gpu.h"
#include "tensorflow/core/kernels/pooling_ops_3d_gpu.h"
#endif
@@ -738,7 +738,7 @@ class MaxPooling3dGradGradOp : public OpKernel {
TF_CALL_float(REGISTER_CPU_KERNELS);
#undef REGISTER_CPU_KERNELS
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <typename T>
struct LaunchPoolingOp<GPUDevice, T, AVG> {
@@ -826,7 +826,7 @@ struct LaunchMaxPooling3dGradGradOp<GPUDevice, T> {
TF_CALL_float(REGISTER_GPU_KERNELS) TF_CALL_half(REGISTER_GPU_KERNELS)
#undef REGISTER_GPU_KERNELS
-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T)

@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
@@ -35,7 +35,7 @@ __global__ void MaxPoolGradBackwardNoMaskNCDHW(
const int stride_p, const int stride_h, const int stride_w, const int pad_p,
const int pad_t, const int pad_l, const dtype* top_diff,
dtype* bottom_diff) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
// First find out the index to the maximum, since we have no mask.
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
@@ -85,7 +85,7 @@ __global__ void MaxPoolGradBackwardNoMaskNDHWC(
const int stride_p, const int stride_h, const int stride_w, const int pad_p,
const int pad_t, const int pad_l, const dtype* top_diff,
dtype* bottom_diff) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
// First find out the index to the maximum, since we have no mask.
int n = index;
int c = n % channels;
@@ -142,16 +142,16 @@ bool MaxPool3dGradBackward<T>::operator()(
const T* top_diff, T* bottom_diff, const Eigen::GpuDevice& d) {
int num_kernels =
batch * channels * pooled_plane * pooled_height * pooled_width;
-GpuLaunchConfig config = GetCudaLaunchConfig(num_kernels, d);
+GpuLaunchConfig config = GetGpuLaunchConfig(num_kernels, d);
if (data_format == FORMAT_NHWC) {
-TF_CHECK_OK(CudaLaunchKernel(
+TF_CHECK_OK(GpuLaunchKernel(
MaxPoolGradBackwardNoMaskNDHWC<T>, config.block_count,
config.thread_per_block, 0, d.stream(), num_kernels, bottom_data,
output_data, pooled_plane, pooled_height, pooled_width, channels, plane,
height, width, kernel_p, kernel_h, kernel_w, stride_p, stride_h,
stride_w, pad_p, pad_t, pad_l, top_diff, bottom_diff));
} else {
-TF_CHECK_OK(CudaLaunchKernel(
+TF_CHECK_OK(GpuLaunchKernel(
MaxPoolGradBackwardNoMaskNCDHW<T>, config.block_count,
config.thread_per_block, 0, d.stream(), num_kernels, bottom_data,
output_data, pooled_plane, pooled_height, pooled_width, channels, plane,
@@ -169,4 +169,4 @@ TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
} // namespace tensorflow
-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#if !GOOGLE_CUDA
-#error This file must only be included when building with Cuda support
+#if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM
+#error This file must only be included when building with Cuda or ROCm support
#endif
#ifndef TENSORFLOW_CORE_KERNELS_POOLING_OPS_3D_GPU_H_

@@ -22,10 +22,12 @@ limitations under the License.
#if GOOGLE_CUDA
#include "third_party/gpus/cudnn/cudnn.h"
#endif // GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
#include "tensorflow/core/platform/stream_executor.h"
-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
namespace tensorflow {
@@ -125,7 +127,7 @@ TensorShape PoolParameters::forward_output_shape() {
}
}
-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
namespace {
template <typename T>
@@ -267,7 +269,7 @@ void DnnPoolingOp<T>::Compute(OpKernelContext* context,
output_desc, &output_data)
.ok();
OP_REQUIRES(context, status,
errors::Internal("cudnn PoolForward launch failed"));
errors::Internal("dnn PoolForward launch failed"));
#if CUDNN_VERSION < 7300
if (data_format == FORMAT_NHWC) {
/// Transform the output data from NCHW back to NHWC
@@ -420,7 +422,7 @@ void DnnPoolingGradOp<T>::Compute(
output_backprop_data, &input_backprop_data)
.ok();
OP_REQUIRES(context, status,
errors::Internal("cudnn PoolBackward launch failed"));
errors::Internal("dnn PoolBackward launch failed"));
if (data_format == FORMAT_NHWC) {
/// Transform the output data from NCHW back to NHWC.
@@ -443,6 +445,6 @@ template class DnnPoolingOp<qint8>;
#undef DEFINE_DNN_OPS
-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
} // namespace tensorflow

@@ -29,9 +29,9 @@ limitations under the License.
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/work_sharder.h"
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/maxpooling_op_gpu.h"
-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
namespace tensorflow {
@@ -269,11 +269,12 @@ class MaxPoolingOp : public OpKernel {
template <typename Device>
struct LaunchMaxPoolingNoMask_NCHW_VECT_C;
-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <>
struct LaunchMaxPoolingNoMask_NCHW_VECT_C<Eigen::GpuDevice> {
static void launch(OpKernelContext* context, const PoolParameters& params,
const Tensor& input, Tensor* output) {
+#if GOOGLE_CUDA
bool status = functor::MaxPoolForwardNoMask_NCHW_VECT_C()(
reinterpret_cast<const int32*>(input.flat<qint8>().data()),
params.tensor_in_batch, params.tensor_in_rows, params.tensor_in_cols,
@@ -286,9 +287,14 @@ struct LaunchMaxPoolingNoMask_NCHW_VECT_C<Eigen::GpuDevice> {
context->SetStatus(errors::Internal(
"Failed launching LaunchMaxPoolingNoMask_NCHW_VECT_C"));
}
+#else
+// ROCm TODO: add support __vmaxs4 on ROCm
+context->SetStatus(errors::Internal(
+"Failed launching LaunchMaxPoolingNoMask_NCHW_VECT_C"));
+#endif // GOOGLE_CUDA
}
};
-#endif
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <typename Device, typename T>
class MaxPoolingV2Op : public OpKernel {
@@ -405,7 +411,7 @@ class MaxPoolingV2Op : public OpKernel {
// Spatial MaxPooling implementation.
//
// TODO(vrv): Remove this once we no longer need it.
-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
if (std::is_same<Device, GPUDevice>::value) {
Eigen::PaddingType pt = BrainPadding2EigenPadding(padding);
if (std::is_same<T, qint8>::value) {