Add ROCm support for pooling operators
max pooling for qint8 is disabled on ROCm for now
parent f0ae8bef6a
commit 4f9e60968e
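
The pattern applied throughout this diff is TensorFlow's CUDA/ROCm portability layer: device loops use GPU_1D_KERNEL_LOOP instead of CUDA_1D_KERNEL_LOOP, launches go through GetGpuLaunchConfig and GpuLaunchKernel instead of the Cuda-prefixed helpers, atomics use GpuAtomicAdd, and GPU-only code is guarded by GOOGLE_CUDA || TENSORFLOW_USE_ROCM. The sketch below only illustrates that pattern and is not part of this commit; the kernel ScaleInPlace and its launcher are hypothetical, and the helper header path may differ between TensorFlow versions.

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU

#include "tensorflow/core/util/gpu_kernel_helper.h"

namespace tensorflow {

// Hypothetical element-wise kernel that scales a buffer in place. The
// GPU_1D_KERNEL_LOOP macro expands to the appropriate grid-stride loop on
// both CUDA and ROCm builds.
template <typename T>
__global__ void ScaleInPlace(const int nthreads, const T scale, T* data) {
  GPU_1D_KERNEL_LOOP(index, nthreads) { data[index] = data[index] * scale; }
}

// Host-side launcher using the platform-neutral helpers this commit switches
// the pooling kernels to (GetGpuLaunchConfig / GpuLaunchKernel).
template <typename T>
bool RunScaleInPlace(T* data, int n, T scale, const Eigen::GpuDevice& d) {
  if (n == 0) return true;
  GpuLaunchConfig config = GetGpuLaunchConfig(n, d);
  TF_CHECK_OK(GpuLaunchKernel(ScaleInPlace<T>, config.block_count,
                              config.thread_per_block, 0, d.stream(),
                              config.virtual_thread_count, scale, data));
  return d.ok();
}

}  // namespace tensorflow

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

On a ROCm build the same source is compiled with the HIP toolchain, so the pooling code compiles for both platforms; the main exception is the qint8 NCHW_VECT_C max-pool kernel, which relies on the CUDA __vmaxs4 intrinsic and therefore stays behind a GOOGLE_CUDA-only guard for now (hence the note in the commit message).
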
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #define EIGEN_USE_GPU

@@ -48,7 +48,7 @@ __global__ void AvePoolBackwardNHWC(const int nthreads,
 const int kernel_w, const int stride_h,
 const int stride_w, const int pad_t,
 const int pad_l, dtype* const bottom_diff) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
 // find out the local index
 // find out the local offset
 const int c = index % channels;
@@ -90,8 +90,8 @@ bool RunAvePoolBackwardNHWC(const T* const top_diff, const int num,
 const int pad_l, T* const bottom_diff,
 const GPUDevice& d) {
 int x_size = num * height * width * channels;
-GpuLaunchConfig config = GetCudaLaunchConfig(x_size, d);
-TF_CHECK_OK(CudaLaunchKernel(
+GpuLaunchConfig config = GetGpuLaunchConfig(x_size, d);
+TF_CHECK_OK(GpuLaunchKernel(
 AvePoolBackwardNHWC<T>, config.block_count, config.thread_per_block, 0,
 d.stream(), config.virtual_thread_count, top_diff, num, height, width,
 channels, pooled_height, pooled_width, kernel_h, kernel_w, stride_h,
@@ -121,4 +121,4 @@ template bool RunAvePoolBackwardNHWC(

 } // end namespace tensorflow

-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

@@ -608,6 +608,7 @@ typedef AutoTuneSingleton<ConvAutoTuneGroup, ConvParameters,
 se::dnn::AlgorithmConfig>
 AutoTuneConv;

+#if GOOGLE_CUDA
 // Check the passed allocator for redzone violations.
 // If violations have occurred, mark the corresponding autotune result
 // as a failure.
@@ -646,6 +647,7 @@ static void CheckRedzones(const se::cuda::RedzoneAllocator& rz_allocator,
 LOG(ERROR) << rz_check_status.RedzoneFailureMsg();
 }
 }
+#endif

 template <typename T>
 void LaunchConv2DOp<GPUDevice, T>::operator()(

@@ -28,7 +28,7 @@ typedef Eigen::GpuDevice GPUDevice;

 namespace tensorflow {

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 template <typename T>
 void DnnPooling3dOp<T>::Compute(OpKernelContext* context,
@@ -103,7 +103,7 @@ void DnnPooling3dOp<T>::Compute(OpKernelContext* context,
 output_desc, &output_data)
 .ok();
 OP_REQUIRES(context, status,
-errors::Internal("cudnn PoolForward launch failed"));
+errors::Internal("dnn PoolForward launch failed"));

 if (data_format == FORMAT_NHWC) {
 auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
@@ -232,7 +232,7 @@ void DnnPooling3dGradOp<T>::Compute(
 output_backprop_data, &input_backprop_data)
 .ok();
 OP_REQUIRES(context, status,
-errors::Internal("cudnn PoolBackward launch failed"));
+errors::Internal("dnn PoolBackward launch failed"));

 if (data_format == FORMAT_NHWC) {
 auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
@@ -249,6 +249,6 @@ void DnnPooling3dGradOp<T>::Compute(
 TF_CALL_float(DEFINE_DNN_OPS) TF_CALL_half(DEFINE_DNN_OPS)
 #undef DEFINE_DNN_OPS

-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 } // namespace tensorflow

@@ -22,7 +22,7 @@ limitations under the License.

 #include "tensorflow/core/framework/op_kernel.h"

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #include "tensorflow/core/platform/stream_executor.h"
 #endif

@@ -30,7 +30,7 @@ limitations under the License.

 namespace tensorflow {

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 // Runs (avg/max)pooling on GPU.
 // Dimension order for all array arguments is: x, y, z.

@@ -42,10 +42,12 @@ limitations under the License.

 #if GOOGLE_CUDA
 #include "third_party/gpus/cudnn/cudnn.h"
+#endif // GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #include "tensorflow/core/kernels/maxpooling_op_gpu.h"
 #include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
 #include "tensorflow/core/platform/stream_executor.h"
-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 namespace tensorflow {

@@ -317,7 +319,7 @@ class MaxPoolingGradOp : public OpKernel {
 TensorFormat data_format_;
 };

-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 template <typename T>
 static void MaxPoolingBackwardCustomKernel(
@@ -438,7 +440,7 @@ class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
 bool propagate_nans_;
 };

-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 // The operation to compute gradient of MaxPool gradients.
 // It takes three inputs:
@@ -647,7 +649,7 @@ class MaxPoolingGradGradOp : public OpKernel {
 TensorFormat data_format_;
 };

-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 template <class T>
 class MaxPoolingGradGradOp<Eigen::GpuDevice, T> : public OpKernel {
@@ -744,7 +746,7 @@ class MaxPoolingGradGradOp<Eigen::GpuDevice, T> : public OpKernel {
 bool use_dnn_;
 };

-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 template <typename Device, typename T>
 struct LaunchMaxPoolingNoMask;
@@ -1112,7 +1114,7 @@ class MaxPoolingGradGradWithArgmaxOp : public OpKernel {
 bool include_batch_in_index_;
 };

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 template <typename T>
 class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
 public:
@@ -1383,7 +1385,7 @@ struct LaunchMaxPoolingGradGradWithArgmax<Eigen::GpuDevice, T> {
 }
 };

-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #define REGISTER_MAX_POOL_KERNELS(D, T) \
 REGISTER_KERNEL_BUILDER( \
@@ -1430,7 +1432,7 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_ONLY_POOL_KERNELS);
 TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_MAX_POOL_KERNELS);
 #undef REGISTER_CPU_KERNELS

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 // Forward declarations for the functor specializations for GPU.
 namespace functor {
@@ -1509,7 +1511,7 @@ REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")

 #undef REGISTER_GPU_ONLY_POOL_KERNELS

-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #undef REGISTER_MAX_POOL_KERNELS


@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #define EIGEN_USE_GPU

@@ -70,7 +70,7 @@ __global__ void MaxPoolForwardNCHW(
 const int pooled_width, const int kernel_h, const int kernel_w,
 const int stride_h, const int stride_w, const int pad_t, const int pad_l,
 dtype* top_data, int64* mask, const bool include_batch_in_index) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
 int pw = index % pooled_width;
 int ph = (index / pooled_width) % pooled_height;
 int c = (index / pooled_width / pooled_height) % channels;
@@ -101,6 +101,7 @@ __global__ void MaxPoolForwardNCHW(
 }
 }

+#if GOOGLE_CUDA
 // The parameters for MaxPoolForwardNoMaskKernel_NCHW_VECT_C are the same as for
 // MaxPoolForwardNCHW above, except that mask is not supported, and each
 // element of the input and output contains 4 adjacent channel values for
@@ -114,7 +115,7 @@ __global__ void MaxPoolForwardNoMaskKernel_NCHW_VECT_C(
 int32* top_data) {
 // TODO(pauldonnelly): Implement a better optimized version of this kernel.
 const int32 kMinINT8X4 = 0x80808080;
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
 int pw = index % pooled_width;
 int ph = (index / pooled_width) % pooled_height;
 int c = (index / pooled_width / pooled_height) % channels;
@@ -136,6 +137,7 @@ __global__ void MaxPoolForwardNoMaskKernel_NCHW_VECT_C(
 top_data[index] = maxval;
 }
 }
+#endif // GOOGLE_CUDA

 template <bool propagate_nans, typename dtype>
 __global__ void MaxPoolForwardNHWC(
@@ -144,7 +146,7 @@ __global__ void MaxPoolForwardNHWC(
 const int pooled_width, const int kernel_h, const int kernel_w,
 const int stride_h, const int stride_w, const int pad_t, const int pad_l,
 dtype* top_data, int64* mask, const bool include_batch_in_index) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
 int n = index;
 int c = n % channels;
 n /= channels;
@@ -183,7 +185,7 @@ __global__ void MaxPoolBackwardNoMaskNHWC(
 const int pooled_width, const int kernel_h, const int kernel_w,
 const int stride_h, const int stride_w, const int pad_t, const int pad_l,
 const dtype* top_diff, dtype* bottom_diff) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
 // First find out the index to the maximum, since we have no mask.
 int n = index;
 int c = n % channels;
@@ -212,7 +214,7 @@ __global__ void MaxPoolBackwardNoMaskNHWC(
 // Atomically accumulate the bottom diff. The index could still be
 // uninitialized, if all the bottom_data are NaN.
 if (maxidx != -1) {
-CudaAtomicAdd(bottom_diff + n * height * width * channels + maxidx,
+GpuAtomicAdd(bottom_diff + n * height * width * channels + maxidx,
 top_diff[index]);
 }
 }
@@ -234,7 +236,7 @@ __global__ void MaxPoolBackwardNoMaskNHWC(
 // bottom_diff: the gradient with respect to the input.
 // include_batch_in_index: whether to include batch dimension in flattened
 // index of `argmax`.
-// This function relies on CudaAtomicAdd to avoid race conditions. Also, before
+// This function relies on GpuAtomicAdd to avoid race conditions. Also, before
 // the kernel is run, you will need to make sure that bottom_diff is filled with
 // zero first.
 template <typename dtype>
@@ -242,10 +244,10 @@ __global__ void MaxPoolBackward(const int nthreads, const dtype* top_diff,
 const int64* mask, const int top_offset,
 const int bottom_offset, dtype* bottom_diff,
 const bool include_batch_in_index) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
 const int offset =
 include_batch_in_index ? 0 : (index / top_offset) * bottom_offset;
-CudaAtomicAdd(bottom_diff + offset + mask[index], top_diff[index]);
+GpuAtomicAdd(bottom_diff + offset + mask[index], top_diff[index]);
 }
 }

@@ -270,7 +272,7 @@ __global__ void MaxPoolGradBackwardNoMaskNCHW(
 const int height, const int width, const int kernel_h, const int kernel_w,
 const int stride_h, const int stride_w, const int pad_t, const int pad_l,
 const dtype* top_diff, dtype* bottom_diff) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
 // First find out the index to the maximum, since we have no mask.
 int pw = index % pooled_width;
 int ph = (index / pooled_width) % pooled_height;
@@ -310,7 +312,7 @@ __global__ void MaxPoolGradBackwardNoMaskNHWC(
 const int height, const int width, const int kernel_h, const int kernel_w,
 const int stride_h, const int stride_w, const int pad_t, const int pad_l,
 const dtype* top_diff, dtype* bottom_diff) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
 // First find out the index to the maximum, since we have no mask.
 int n = index;
 int c = n % channels;
@@ -369,18 +371,19 @@ __global__ void MaxPoolGradBackward(const int nthreads, const dtype* top_diff,
 const int64* mask, const int top_offset,
 const int bottom_offset, dtype* bottom_diff,
 const bool include_batch_in_index) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
 const int offset =
 include_batch_in_index ? 0 : (index / bottom_offset) * top_offset;
 bottom_diff[index] = top_diff[offset + mask[index]];
 }
 }

-#undef CUDA_1D_KERNEL_LOOP
+#undef GPU_1D_KERNEL_LOOP
 } // namespace

 namespace functor {

+#if GOOGLE_CUDA
 // Note: channels is the outer channels (dim 1) which has already been
 // divided by 4.
 bool MaxPoolForwardNoMask_NCHW_VECT_C::operator()(
@@ -392,7 +395,7 @@ bool MaxPoolForwardNoMask_NCHW_VECT_C::operator()(
 const int kThreadsPerBlock = 1024;
 const int output_size = batch * channels * pooled_height * pooled_width;
 if (output_size == 0) return true;
-TF_CHECK_OK(CudaLaunchKernel(
+TF_CHECK_OK(GpuLaunchKernel(
 MaxPoolForwardNoMaskKernel_NCHW_VECT_C,
 (output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock,
 0, d.stream(), output_size, bottom_data, height, width, channels,
@@ -400,6 +403,7 @@ bool MaxPoolForwardNoMask_NCHW_VECT_C::operator()(
 pad_t, pad_l, top_data));
 return d.ok();
 }
+#endif // GOOGLE_CUDA

 template <typename T>
 bool MaxPoolForwardWithOptionalArgmax<T>::operator()(
@@ -413,7 +417,7 @@ bool MaxPoolForwardWithOptionalArgmax<T>::operator()(
 const int output_size = batch * channels * pooled_height * pooled_width;
 if (output_size == 0) return true;
 if (propagate_nans) {
-TF_CHECK_OK(CudaLaunchKernel(
+TF_CHECK_OK(GpuLaunchKernel(
 MaxPoolForwardNHWC<true, T>,
 (output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
 kThreadsPerBlock, 0, d.stream(), output_size, bottom_data, height,
@@ -421,7 +425,7 @@ bool MaxPoolForwardWithOptionalArgmax<T>::operator()(
 stride_h, stride_w, pad_t, pad_l, top_data, mask,
 include_batch_in_index));
 } else {
-TF_CHECK_OK(CudaLaunchKernel(
+TF_CHECK_OK(GpuLaunchKernel(
 MaxPoolForwardNHWC<false, T>,
 (output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
 kThreadsPerBlock, 0, d.stream(), output_size, bottom_data, height,
@@ -443,12 +447,12 @@ bool MaxPoolBackwardNoMask<T>::operator()(

 const int bottom_size = batch * channels * height * width;
 if (bottom_size == 0) return true;
-TF_CHECK_OK(CudaLaunchKernel(
+TF_CHECK_OK(GpuLaunchKernel(
 SetZero<T>, (bottom_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
 kThreadsPerBlock, 0, d.stream(), bottom_size, bottom_diff));

 const int top_size = batch * channels * pooled_height * pooled_width;
-TF_CHECK_OK(CudaLaunchKernel(
+TF_CHECK_OK(GpuLaunchKernel(
 MaxPoolBackwardNoMaskNHWC<T>,
 (top_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0,
 d.stream(), top_size, bottom_data, height, width, channels, pooled_height,
@@ -465,10 +469,10 @@ bool MaxPoolBackwardWithArgmax<T>::operator()(
 const bool include_batch_in_index) {
 const int kThreadsPerBlock = 1024;
 if (input_size == 0) return true;
-TF_CHECK_OK(CudaLaunchKernel(
+TF_CHECK_OK(GpuLaunchKernel(
 SetZero<T>, (input_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
 kThreadsPerBlock, 0, d.stream(), input_size, bottom_diff));
-TF_CHECK_OK(CudaLaunchKernel(
+TF_CHECK_OK(GpuLaunchKernel(
 MaxPoolBackward<T>,
 (output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock,
 0, d.stream(), output_size, top_diff, mask, top_offset, bottom_offset,
@@ -486,18 +490,18 @@ bool MaxPoolGradBackwardNoMask<T>::operator()(
 const Eigen::GpuDevice& d) {
 const int num_kernels = batch * channels * pooled_height * pooled_width;
 if (num_kernels == 0) return true;
-GpuLaunchConfig config = GetCudaLaunchConfig(num_kernels, d);
+GpuLaunchConfig config = GetGpuLaunchConfig(num_kernels, d);

 if (data_format == FORMAT_NHWC) {
 TF_CHECK_OK(
-CudaLaunchKernel(MaxPoolGradBackwardNoMaskNHWC<T>, config.block_count,
+GpuLaunchKernel(MaxPoolGradBackwardNoMaskNHWC<T>, config.block_count,
 config.thread_per_block, 0, d.stream(), num_kernels,
 bottom_data, output_data, pooled_height, pooled_width,
 channels, height, width, kernel_h, kernel_w, stride_h,
 stride_w, pad_t, pad_l, top_diff, bottom_diff));
 } else {
 TF_CHECK_OK(
-CudaLaunchKernel(MaxPoolGradBackwardNoMaskNCHW<T>, config.block_count,
+GpuLaunchKernel(MaxPoolGradBackwardNoMaskNCHW<T>, config.block_count,
 config.thread_per_block, 0, d.stream(), num_kernels,
 bottom_data, output_data, pooled_height, pooled_width,
 channels, height, width, kernel_h, kernel_w, stride_h,
@@ -513,8 +517,8 @@ bool MaxPoolGradBackwardWithArgmax<T>::operator()(
 T* bottom_diff, const Eigen::GpuDevice& d,
 const bool include_batch_in_index) {
 if (input_size == 0) return true;
-GpuLaunchConfig config = GetCudaLaunchConfig(output_size, d);
-TF_CHECK_OK(CudaLaunchKernel(
+GpuLaunchConfig config = GetGpuLaunchConfig(output_size, d);
+TF_CHECK_OK(GpuLaunchKernel(
 MaxPoolGradBackward<T>, config.block_count, config.thread_per_block, 0,
 d.stream(), output_size, top_diff, mask, top_offset, bottom_offset,
 bottom_diff, include_batch_in_index));
@@ -539,4 +543,4 @@ TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);

 } // end namespace tensorflow

-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

@@ -33,7 +33,7 @@ limitations under the License.
 #include "tensorflow/core/util/tensor_format.h"
 #include "tensorflow/core/util/work_sharder.h"

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #include "tensorflow/core/kernels/cudnn_pooling_gpu.h"
 #include "tensorflow/core/kernels/pooling_ops_3d_gpu.h"
 #endif
@@ -738,7 +738,7 @@ class MaxPooling3dGradGradOp : public OpKernel {
 TF_CALL_float(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 template <typename T>
 struct LaunchPoolingOp<GPUDevice, T, AVG> {
@@ -826,7 +826,7 @@ struct LaunchMaxPooling3dGradGradOp<GPUDevice, T> {
 TF_CALL_float(REGISTER_GPU_KERNELS) TF_CALL_half(REGISTER_GPU_KERNELS)
 #undef REGISTER_GPU_KERNELS

-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #ifdef TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T)

@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #define EIGEN_USE_GPU

@@ -35,7 +35,7 @@ __global__ void MaxPoolGradBackwardNoMaskNCDHW(
 const int stride_p, const int stride_h, const int stride_w, const int pad_p,
 const int pad_t, const int pad_l, const dtype* top_diff,
 dtype* bottom_diff) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
 // First find out the index to the maximum, since we have no mask.
 int pw = index % pooled_width;
 int ph = (index / pooled_width) % pooled_height;
@@ -85,7 +85,7 @@ __global__ void MaxPoolGradBackwardNoMaskNDHWC(
 const int stride_p, const int stride_h, const int stride_w, const int pad_p,
 const int pad_t, const int pad_l, const dtype* top_diff,
 dtype* bottom_diff) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
 // First find out the index to the maximum, since we have no mask.
 int n = index;
 int c = n % channels;
@@ -142,16 +142,16 @@ bool MaxPool3dGradBackward<T>::operator()(
 const T* top_diff, T* bottom_diff, const Eigen::GpuDevice& d) {
 int num_kernels =
 batch * channels * pooled_plane * pooled_height * pooled_width;
-GpuLaunchConfig config = GetCudaLaunchConfig(num_kernels, d);
+GpuLaunchConfig config = GetGpuLaunchConfig(num_kernels, d);
 if (data_format == FORMAT_NHWC) {
-TF_CHECK_OK(CudaLaunchKernel(
+TF_CHECK_OK(GpuLaunchKernel(
 MaxPoolGradBackwardNoMaskNDHWC<T>, config.block_count,
 config.thread_per_block, 0, d.stream(), num_kernels, bottom_data,
 output_data, pooled_plane, pooled_height, pooled_width, channels, plane,
 height, width, kernel_p, kernel_h, kernel_w, stride_p, stride_h,
 stride_w, pad_p, pad_t, pad_l, top_diff, bottom_diff));
 } else {
-TF_CHECK_OK(CudaLaunchKernel(
+TF_CHECK_OK(GpuLaunchKernel(
 MaxPoolGradBackwardNoMaskNCDHW<T>, config.block_count,
 config.thread_per_block, 0, d.stream(), num_kernels, bottom_data,
 output_data, pooled_plane, pooled_height, pooled_width, channels, plane,
@@ -169,4 +169,4 @@ TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);

 } // namespace tensorflow

-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if !GOOGLE_CUDA
-#error This file must only be included when building with Cuda support
+#if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM
+#error This file must only be included when building with Cuda or ROCm support
 #endif

 #ifndef TENSORFLOW_CORE_KERNELS_POOLING_OPS_3D_GPU_H_

@@ -22,10 +22,12 @@ limitations under the License.

 #if GOOGLE_CUDA
 #include "third_party/gpus/cudnn/cudnn.h"
+#endif // GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
 #include "tensorflow/core/platform/stream_executor.h"
-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 namespace tensorflow {

@@ -125,7 +127,7 @@ TensorShape PoolParameters::forward_output_shape() {
 }
 }

-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 namespace {
 template <typename T>
@@ -267,7 +269,7 @@ void DnnPoolingOp<T>::Compute(OpKernelContext* context,
 output_desc, &output_data)
 .ok();
 OP_REQUIRES(context, status,
-errors::Internal("cudnn PoolForward launch failed"));
+errors::Internal("dnn PoolForward launch failed"));
 #if CUDNN_VERSION < 7300
 if (data_format == FORMAT_NHWC) {
 /// Transform the output data from NCHW back to NHWC
@@ -420,7 +422,7 @@ void DnnPoolingGradOp<T>::Compute(
 output_backprop_data, &input_backprop_data)
 .ok();
 OP_REQUIRES(context, status,
-errors::Internal("cudnn PoolBackward launch failed"));
+errors::Internal("dnn PoolBackward launch failed"));

 if (data_format == FORMAT_NHWC) {
 /// Transform the output data from NCHW back to NHWC.
@@ -443,6 +445,6 @@ template class DnnPoolingOp<qint8>;

 #undef DEFINE_DNN_OPS

-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 } // namespace tensorflow

@@ -29,9 +29,9 @@ limitations under the License.
 #include "tensorflow/core/util/tensor_format.h"
 #include "tensorflow/core/util/work_sharder.h"

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #include "tensorflow/core/kernels/maxpooling_op_gpu.h"
-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 namespace tensorflow {

@@ -269,11 +269,12 @@ class MaxPoolingOp : public OpKernel {
 template <typename Device>
 struct LaunchMaxPoolingNoMask_NCHW_VECT_C;

-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 template <>
 struct LaunchMaxPoolingNoMask_NCHW_VECT_C<Eigen::GpuDevice> {
 static void launch(OpKernelContext* context, const PoolParameters& params,
 const Tensor& input, Tensor* output) {
+#if GOOGLE_CUDA
 bool status = functor::MaxPoolForwardNoMask_NCHW_VECT_C()(
 reinterpret_cast<const int32*>(input.flat<qint8>().data()),
 params.tensor_in_batch, params.tensor_in_rows, params.tensor_in_cols,
@@ -286,9 +287,14 @@ struct LaunchMaxPoolingNoMask_NCHW_VECT_C<Eigen::GpuDevice> {
 context->SetStatus(errors::Internal(
 "Failed launching LaunchMaxPoolingNoMask_NCHW_VECT_C"));
 }
+#else
+// ROCm TODO: add support __vmaxs4 on ROCm
+context->SetStatus(errors::Internal(
+"Failed launching LaunchMaxPoolingNoMask_NCHW_VECT_C"));
+#endif // GOOGLE_CUDA
 }
 };
-#endif
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 template <typename Device, typename T>
 class MaxPoolingV2Op : public OpKernel {
@@ -405,7 +411,7 @@ class MaxPoolingV2Op : public OpKernel {
 // Spatial MaxPooling implementation.
 //
 // TODO(vrv): Remove this once we no longer need it.
-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 if (std::is_same<Device, GPUDevice>::value) {
 Eigen::PaddingType pt = BrainPadding2EigenPadding(padding);
 if (std::is_same<T, qint8>::value) {