Add ROCm support for remaining cwise ops and tests
cwise_ops.h: packet access is always false for ROCm builds. Some ops do not yet support complex64 / complex128 types on ROCm.
parent 08dfb5c747
commit 83a0c0e427
@@ -181,7 +181,7 @@ REGISTER_CPU_KERNEL(uint8);
 REGISTER_CPU_KERNEL(uint16);
 #undef REGISTER_CPU_KERNEL
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #define REGISTER_GPU_KERNEL(type) \
   REGISTER_KERNEL_BUILDER(        \
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #define EIGEN_USE_GPU
 
@@ -26,7 +26,7 @@ namespace tensorflow {
 template <typename T>
 __global__ void UnaryClipCustomKernel(const int32 size_in, const T *in0,
                                       const T *in1, const T *in2, T *out) {
-  CUDA_1D_KERNEL_LOOP(i, size_in) {
+  GPU_1D_KERNEL_LOOP(i, size_in) {
     T value = in2[0] < in0[i] ? in2[0] : in0[i];
     out[i] = value < in1[0] ? in1[0] : value;
   }
@@ -36,7 +36,7 @@ template <typename T>
 __global__ void BinaryRightClipCustomKernel(const int32 size_in, const T *in0,
                                             const T *in1, const T *in2,
                                             T *out) {
-  CUDA_1D_KERNEL_LOOP(i, size_in) {
+  GPU_1D_KERNEL_LOOP(i, size_in) {
     T value = in2[i] < in0[i] ? in2[i] : in0[i];
     out[i] = value < in1[0] ? in1[0] : value;
   }
@@ -45,7 +45,7 @@ __global__ void BinaryRightClipCustomKernel(const int32 size_in, const T *in0,
 template <typename T>
 __global__ void BinaryLeftClipCustomKernel(const int32 size_in, const T *in0,
                                            const T *in1, const T *in2, T *out) {
-  CUDA_1D_KERNEL_LOOP(i, size_in) {
+  GPU_1D_KERNEL_LOOP(i, size_in) {
     T value = in2[0] < in0[i] ? in2[0] : in0[i];
     out[i] = value < in1[i] ? in1[i] : value;
   }
@@ -60,9 +60,9 @@ struct UnaryClipOp<GPUDevice, T> {
                  typename TTypes<T>::ConstFlat &in1_flat,
                  typename TTypes<T>::ConstFlat &in2_flat,
                  typename TTypes<T>::Flat &out_flat) const {
-    GpuLaunchConfig config = GetCudaLaunchConfig(in0_flat.size(), d);
+    GpuLaunchConfig config = GetGpuLaunchConfig(in0_flat.size(), d);
 
-    TF_CHECK_OK(CudaLaunchKernel(
+    TF_CHECK_OK(GpuLaunchKernel(
         UnaryClipCustomKernel<T>, config.block_count, config.thread_per_block,
         0, d.stream(), in0_flat.size(), in0_flat.data(), in1_flat.data(),
         in2_flat.data(), out_flat.data()));
@@ -76,9 +76,9 @@ struct BinaryRightClipOp<GPUDevice, T> {
                  typename TTypes<T>::ConstFlat &in1_flat,
                  typename TTypes<T>::ConstFlat &in2_flat,
                  typename TTypes<T>::Flat &out_flat) const {
-    GpuLaunchConfig config = GetCudaLaunchConfig(in0_flat.size(), d);
+    GpuLaunchConfig config = GetGpuLaunchConfig(in0_flat.size(), d);
 
-    TF_CHECK_OK(CudaLaunchKernel(
+    TF_CHECK_OK(GpuLaunchKernel(
         BinaryRightClipCustomKernel<T>, config.block_count,
         config.thread_per_block, 0, d.stream(), in0_flat.size(),
         in0_flat.data(), in1_flat.data(), in2_flat.data(), out_flat.data()));
@@ -92,9 +92,9 @@ struct BinaryLeftClipOp<GPUDevice, T> {
                  typename TTypes<T>::ConstFlat &in1_flat,
                  typename TTypes<T>::ConstFlat &in2_flat,
                  typename TTypes<T>::Flat &out_flat) const {
-    GpuLaunchConfig config = GetCudaLaunchConfig(in0_flat.size(), d);
+    GpuLaunchConfig config = GetGpuLaunchConfig(in0_flat.size(), d);
 
-    TF_CHECK_OK(CudaLaunchKernel(
+    TF_CHECK_OK(GpuLaunchKernel(
         BinaryLeftClipCustomKernel<T>, config.block_count,
         config.thread_per_block, 0, d.stream(), in0_flat.size(),
         in0_flat.data(), in1_flat.data(), in2_flat.data(), out_flat.data()));
@@ -131,4 +131,4 @@ INSTANTIATE_GPU(uint16);
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
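Note: the hunks above switch the clip kernels from the CUDA-specific helpers to TensorFlow's platform-neutral ones (GPU_1D_KERNEL_LOOP, GetGpuLaunchConfig, GpuLaunchKernel), so the same source compiles under both NVCC and HIP. The loop macro expands to a grid-stride loop; below is a standalone sketch of that pattern in plain CUDA/HIP-style C++, with illustrative names, not the TensorFlow helpers themselves.

// Sketch of a grid-stride loop, the pattern behind GPU_1D_KERNEL_LOOP.
// Each thread starts at its global index and strides by the total number
// of launched threads, so any grid size covers any input size n.
template <typename T>
__global__ void ClipSketchKernel(int n, const T *in, const T *lo,
                                 const T *hi, T *out) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    T v = hi[0] < in[i] ? hi[0] : in[i];  // clamp from above
    out[i] = v < lo[0] ? lo[0] : v;       // clamp from below
  }
}

// Hypothetical launch, standing in for GetGpuLaunchConfig + GpuLaunchKernel:
//   int block = 256, grid = (n + block - 1) / block;
//   ClipSketchKernel<<<grid, block>>>(n, in, lo, hi, out);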
@@ -27,15 +27,32 @@ REGISTER6(BinaryOp, CPU, "RealDiv", functor::div, float, Eigen::half, double,
 REGISTER5(BinaryOp, CPU, "DivNoNan", functor::div_no_nan, Eigen::half, float,
           double, complex64, complex128);
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+// ROCM TODO: re-enable complex64 / complex128 after compiler fix
+#if GOOGLE_CUDA
 REGISTER9(BinaryOp, GPU, "Div", functor::div, float, Eigen::half, double, uint8,
           uint16, int16, int64, complex64, complex128);
+#elif TENSORFLOW_USE_ROCM
+REGISTER7(BinaryOp, GPU, "Div", functor::div, float, Eigen::half, double, uint8,
+          uint16, int16, int64);
+#endif
 REGISTER4(BinaryOp, GPU, "TruncateDiv", functor::div, uint8, uint16, int16,
           int64);
+// ROCM TODO: re-enable complex64 / complex128 after compiler fix
+#if GOOGLE_CUDA
 REGISTER5(BinaryOp, GPU, "RealDiv", functor::div, float, Eigen::half, double,
           complex64, complex128);
+#elif TENSORFLOW_USE_ROCM
+REGISTER3(BinaryOp, GPU, "RealDiv", functor::div, float, Eigen::half, double);
+#endif
+
+#if GOOGLE_CUDA
 REGISTER5(BinaryOp, GPU, "DivNoNan", functor::div_no_nan, Eigen::half, float,
           double, complex64, complex128);
+#elif TENSORFLOW_USE_ROCM
+REGISTER3(BinaryOp, GPU, "DivNoNan", functor::div_no_nan, Eigen::half, float,
+          double);
+#endif
 
 // A special GPU kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
 
@@ -21,8 +21,13 @@ namespace tensorflow {
 namespace functor {
 DEFINE_BINARY10(div, Eigen::half, float, double, uint8, uint16, int16, int32,
                 int64, complex64, complex128);
+#if GOOGLE_CUDA
 DEFINE_BINARY5(div_no_nan, Eigen::half, float, double, complex64, complex128);
+#elif TENSORFLOW_USE_ROCM
+// ROCM TODO: fix compiler error for complex64 / complex128 division
+DEFINE_BINARY3(div_no_nan, Eigen::half, float, double);
+#endif
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
 
@@ -24,4 +24,4 @@ DEFINE_BINARY2(polygamma, float, double);
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
@@ -29,7 +29,7 @@ REGISTER5(BinaryOp, CPU, "MulNoNan", functor::mul_no_nan, Eigen::half, float,
 REGISTER(BinaryOp, CPU, "Mul", functor::mul, int32);
 #endif  // __ANDROID_TYPES_SLIM__
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 REGISTER4(BinaryOp, GPU, "Mul", functor::mul, Eigen::half, float, double,
           uint8);
 // A special GPU kernel for int32.
@@ -42,8 +42,15 @@ REGISTER_KERNEL_BUILDER(Name("Mul")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::mul<int32>>);
 #endif
 
+#if GOOGLE_CUDA
 REGISTER5(BinaryOp, GPU, "MulNoNan", functor::mul_no_nan, Eigen::half, float,
           double, complex64, complex128);
+#elif TENSORFLOW_USE_ROCM
+// ROCM TODO: re-enable complex64 / complex128 after compiler fix
+REGISTER3(BinaryOp, GPU, "MulNoNan", functor::mul_no_nan, Eigen::half, float,
+          double);
+#endif
+
 #ifdef TENSORFLOW_USE_SYCL
@@ -21,6 +21,8 @@ REGISTER2(BinaryOp, CPU, "Polygamma", functor::polygamma, float, double);
 
 #if GOOGLE_CUDA
 REGISTER2(BinaryOp, GPU, "Zeta", functor::zeta, float, double);
+#endif
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 REGISTER2(BinaryOp, GPU, "Polygamma", functor::polygamma, float, double);
 #endif
 }  // namespace tensorflow
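The next hunks are the cwise_ops.h change called out in the commit message: each affected functor_traits specialization gains a TENSORFLOW_USE_ROCM branch that forces PacketAccess to false, so these ops fall back to scalar evaluation instead of Eigen's vectorized packet path on ROCm. A minimal self-contained sketch of the guard (illustrative trait name, not TensorFlow source):

// Sketch of the PacketAccess guard the diff adds to div_no_nan_op,
// mul_no_nan_op, scalar_left, scalar_right, and the other functors below.
template <typename Functor>
struct FunctorTraitsSketch {
  enum {
    Cost = 1,  // stand-in for the functor's real cost expression
#if TENSORFLOW_USE_ROCM
    PacketAccess = false,  // ROCm: packet (vectorized) access disabled
#else
    PacketAccess = true,   // elsewhere: keep the usual packet-access value
#endif  // TENSORFLOW_USE_ROCM
  };
};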
@@ -176,7 +176,11 @@ template <typename T>
 struct functor_traits<div_no_nan_op<T>> {
   enum {
     Cost = functor_traits<scalar_quotient_op<T>>::Cost + NumTraits<T>::AddCost,
+#if TENSORFLOW_USE_ROCM
+    PacketAccess = false,
+#else
     PacketAccess = true,
+#endif  // TENSORFLOW_USE_ROCM
   };
 };
 
@@ -189,7 +193,11 @@ template <typename T>
 struct functor_traits<mul_no_nan_op<T>> {
   enum {
     Cost = functor_traits<scalar_product_op<T>>::Cost + NumTraits<T>::AddCost,
+#if TENSORFLOW_USE_ROCM
+    PacketAccess = false,
+#else
     PacketAccess = true,
+#endif  // TENSORFLOW_USE_ROCM
   };
 };
 
@@ -227,7 +235,11 @@ template <typename Tout, typename Tin, typename Binary>
 struct functor_traits<scalar_left<Tout, Tin, Binary>> {
   enum {
     Cost = functor_traits<Binary>::Cost,
+#if TENSORFLOW_USE_ROCM
+    PacketAccess = false,
+#else
     PacketAccess = functor_traits<Binary>::PacketAccess,
+#endif  // TENSORFLOW_USE_ROCM
   };
 };
 
@@ -257,7 +269,11 @@ template <typename Tout, typename Tin, typename Binary>
 struct functor_traits<scalar_right<Tout, Tin, Binary>> {
   enum {
     Cost = functor_traits<Binary>::Cost,
+#if TENSORFLOW_USE_ROCM
+    PacketAccess = false,
+#else
     PacketAccess = functor_traits<Binary>::PacketAccess,
+#endif  // TENSORFLOW_USE_ROCM
   };
 };
 
@@ -350,8 +366,13 @@ struct google_floor_div {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& x,
                                                            const T& y) const {
     if ((x < T(0)) != (y < T(0))) {
+#if defined(__HIP_DEVICE_COMPILE__)
+      T abs_x = (x < T(0)) ? -x : x;
+      T abs_y = (y < T(0)) ? -y : y;
+#else
       T abs_x = std::abs(x);
       T abs_y = std::abs(y);
+#endif
       return -(abs_x + abs_y - 1) / abs_y;
     } else {
       return x / y;
@@ -392,7 +413,11 @@ struct functor_traits<google_floor_div<Scalar>> {
     Cost = 2 * Eigen::internal::scalar_div_cost<
                    Scalar, packet_traits<Scalar>::HasDiv>::value +
            NumTraits<Scalar>::AddCost,
+#if TENSORFLOW_USE_ROCM
+    PacketAccess = false,
+#else
     PacketAccess = packet_traits<Scalar>::HasDiv
+#endif
   };
 };
 
@@ -415,8 +440,12 @@ struct functor_traits<google_floor_div_real<Scalar>> {
     Cost = 2 * Eigen::internal::scalar_div_cost<
                    Scalar, packet_traits<Scalar>::HasDiv>::value +
            2 * NumTraits<Scalar>::AddCost,
+#if TENSORFLOW_USE_ROCM
+    PacketAccess = false,
+#else
     PacketAccess =
         packet_traits<Scalar>::HasDiv && packet_traits<Scalar>::HasFloor
+#endif
   };
 };
 
@@ -513,7 +542,11 @@ struct functor_traits<scalar_round_op_google<Scalar>> {
   enum {
     Cost = Eigen::NumTraits<Scalar>::IsInteger ? 0
                                                : 4 * NumTraits<Scalar>::AddCost,
+#if TENSORFLOW_USE_ROCM
+    PacketAccess = false,
+#else
     PacketAccess = Eigen::NumTraits<Scalar>::IsInteger
+#endif
   };
 };
 
@@ -551,7 +584,11 @@ template <typename Scalar, bool IsInteger>
 struct functor_traits<scalar_round_up_op<Scalar, IsInteger>> {
   enum {
     Cost = IsInteger ? 0 : 4 * NumTraits<Scalar>::AddCost,
+#if TENSORFLOW_USE_ROCM
+    PacketAccess = false,
+#else
     PacketAccess = IsInteger || packet_traits<Scalar>::HasFloor
+#endif
   };
 };
 
@@ -604,7 +641,11 @@ struct functor_traits<xlogy_op<Scalar>> {
   enum {
     Cost = functor_traits<scalar_log_op<Scalar>>::Cost +
            Eigen::NumTraits<Scalar>::MulCost,
+#if TENSORFLOW_USE_ROCM
+    PacketAccess = false,
+#else
     PacketAccess = functor_traits<scalar_log_op<Scalar>>::PacketAccess
+#endif
   };
 };
 
@@ -635,7 +676,11 @@ struct functor_traits<xdivy_op<Scalar>> {
            Eigen::NumTraits<Scalar>::AddCost +
            Eigen::internal::scalar_div_cost<Scalar,
                                             packet_traits<Scalar>::HasDiv>::value,
+#if TENSORFLOW_USE_ROCM
+    PacketAccess = false,
+#else
     PacketAccess = packet_traits<Scalar>::HasDiv
+#endif
   };
 };
 
@@ -855,7 +900,7 @@ struct scalar_rint_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_rint_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
   operator()(const Scalar& a) const {
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(__HIPCC__)
     return ::rint(a);
 #elif defined(__ANDROID__)
     return rint(a);
@@ -985,6 +1030,8 @@ struct scalar_atan2_op {
   operator()(const Scalar& y, const Scalar& x) const {
 #if GOOGLE_CUDA
     return std::atan2(y, x);
+#elif TENSORFLOW_USE_ROCM
+    return ::atan2(y, x);
 #else
     return std::atan2(y, x);
 #endif
@@ -53,38 +53,38 @@ int ColsFromArg(int arg) { return (arg % kRows); }
   BENCHMARK(BM_##DEVICE##_##FUNC##_##TYPE)->Range(4 << 10, 1 << 20);
 
 BM_UNARY(cpu, Floor, float, DT_FLOAT);
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 BM_UNARY(gpu, Floor, float, DT_FLOAT);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #ifdef TENSORFLOW_USE_SYCL
 BM_UNARY(sycl, Floor, float, DT_FLOAT);
 #endif  // TENSORFLOW_USE_SYCL
 
 BM_UNARY(cpu, Floor, double, DT_DOUBLE);
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 BM_UNARY(gpu, Floor, double, DT_DOUBLE);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #ifdef TENSORFLOW_USE_SYCL
 BM_UNARY(sycl, Floor, double, DT_DOUBLE);
 #endif  // TENSORFLOW_USE_SYCL
 
 BM_UNARY(cpu, Conj, std::complex<float>, DT_COMPLEX64);
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 BM_UNARY(gpu, Conj, std::complex<float>, DT_COMPLEX64);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 BM_UNARY(cpu, Conj, std::complex<double>, DT_COMPLEX128);
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 BM_UNARY(gpu, Conj, std::complex<double>, DT_COMPLEX128);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 BM_UNARY(cpu, Rint, double, DT_DOUBLE);
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 BM_UNARY(gpu, Rint, double, DT_DOUBLE);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 BM_UNARY(cpu, Rint, float, DT_FLOAT);
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 BM_UNARY(gpu, Rint, float, DT_FLOAT);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 // data func scalar.
 Graph* BinaryScalar(int num, const string& func) {
@@ -113,17 +113,17 @@ Graph* BinaryScalar(int num, const string& func) {
     ->Arg(1048576);
 
 BM_BINARY_SCALAR(cpu, Less);
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 BM_BINARY_SCALAR(gpu, Less);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #ifdef TENSORFLOW_USE_SYCL
 BM_BINARY_SCALAR(sycl, Less);
 #endif  // TENSORFLOW_USE_SYCL
 
 BM_BINARY_SCALAR(cpu, Add);
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 BM_BINARY_SCALAR(gpu, Add);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #ifdef TENSORFLOW_USE_SYCL
 BM_BINARY_SCALAR(sycl, Add);
 #endif  // TENSORFLOW_USE_SYCL
@@ -173,13 +173,13 @@ Graph* BiasAdd(int rows, int cols, DataType type) {
 
 using Eigen::half;
 BM_BIAS_ADD_ALL(cpu, float, DT_FLOAT);
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 BM_BIAS_ADD_ALL(gpu, float, DT_FLOAT);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 BM_BIAS_ADD_ALL(cpu, half, DT_HALF);
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 BM_BIAS_ADD_ALL(gpu, half, DT_HALF);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #undef BM_BIAS_ADD_ALL
 #undef BM_BIAS_ADD
 
@@ -227,18 +227,18 @@ Graph* BiasAddGrad(int rows, int cols, int channels, DataType type,
   BM_BIAS_ADD_GRAD(DEVICE, FORMAT, C_TYPE, TF_TYPE, 4096, 4096, 1);
 
 using Eigen::half;
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, float, DT_FLOAT);
 BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, half, DT_HALF);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, float, DT_FLOAT);
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, float, DT_FLOAT);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, half, DT_HALF);
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, half, DT_HALF);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #undef BM_BIAS_ADD_GRAD_ALL
 #undef BM_BIAS_ADD_GRAD
 
@@ -285,9 +285,9 @@ Graph* BcastAdd(int rows, int cols, int dim) {
   BM_BCAST_ADD_ROW(DEVICE, 2048, 512); \
   BM_BCAST_ADD_ROW(DEVICE, 4096, 512);
 BM_BCAST_ADD_ROW_ALL(cpu);
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 BM_BCAST_ADD_ROW_ALL(gpu);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #ifdef TENSORFLOW_USE_SYCL
 BM_BCAST_ADD_ROW_ALL(sycl);
 #endif  // TENSORFLOW_USE_SYCL
@@ -312,9 +312,9 @@ BM_BCAST_ADD_ROW_ALL(sycl);
   BM_BCAST_ADD_COL(DEVICE, 2048, 512); \
   BM_BCAST_ADD_COL(DEVICE, 4096, 512);
 BM_BCAST_ADD_COL_ALL(cpu);
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 BM_BCAST_ADD_COL_ALL(gpu);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #ifdef TENSORFLOW_USE_SYCL
 BM_BCAST_ADD_COL_ALL(sycl);
 #endif  // TENSORFLOW_USE_SYCL
@@ -340,9 +340,9 @@ BM_BCAST_ADD_COL_ALL(sycl);
   BM_BCAST_ADD_CROSS_RC(DEVICE, 2048, 512); \
   BM_BCAST_ADD_CROSS_RC(DEVICE, 4096, 512);
 BM_BCAST_ADD_CROSS_RC_ALL(cpu);
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 BM_BCAST_ADD_CROSS_RC_ALL(gpu);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #ifdef TENSORFLOW_USE_SYCL
 BM_BCAST_ADD_CROSS_RC_ALL(sycl);
 #endif  // TENSORFLOW_USE_SYCL
@@ -368,9 +368,9 @@ BM_BCAST_ADD_CROSS_RC_ALL(sycl);
   BM_BCAST_ADD_CROSS_CR(DEVICE, 2048, 512); \
   BM_BCAST_ADD_CROSS_CR(DEVICE, 4096, 512);
 BM_BCAST_ADD_CROSS_CR_ALL(cpu);
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 BM_BCAST_ADD_CROSS_CR_ALL(gpu);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #ifdef TENSORFLOW_USE_SYCL
 BM_BCAST_ADD_CROSS_CR_ALL(sycl);
 #endif  // TENSORFLOW_USE_SYCL
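The benchmark edits above are mechanical: each BM_* macro stamps out one benchmark per (device, op, type) combination, and the guards now admit the gpu instantiations in ROCm builds as well. A simplified sketch of the token-pasting idea behind BM_UNARY, hypothetical and detached from the TensorFlow test harness:

#include <cstdio>

// Token pasting in the style of BM_UNARY: each expansion defines a uniquely
// named function, which BENCHMARK(BM_##DEVICE##_##FUNC##_##TYPE) can register.
#define BM_UNARY_SKETCH(DEVICE, FUNC, TYPE)                    \
  static void BM_##DEVICE##_##FUNC##_##TYPE(int iters) {       \
    /* a real benchmark would build and run the FUNC graph */  \
    std::printf(#DEVICE "/" #FUNC "/" #TYPE " x%d\n", iters);  \
  }

BM_UNARY_SKETCH(cpu, Floor, float)  // defines BM_cpu_Floor_float
BM_UNARY_SKETCH(gpu, Floor, float)  // defines BM_gpu_Floor_float

int main() {
  BM_cpu_Floor_float(100);
  BM_gpu_Floor_float(100);
  return 0;
}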