Adding ROCm support for tile ops

This commit is contained in:
Deven Desai 2019-05-20 18:18:42 +00:00
parent fff00129e1
commit 2925f2373a
21 changed files with 58 additions and 58 deletions

View File

@ -32,10 +32,10 @@ template <typename T>
void TileSimple(const Eigen::ThreadPoolDevice& d, Tensor* out,
const Tensor& in);
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <typename T>
void TileSimple(const Eigen::GpuDevice& d, Tensor* out, const Tensor& in);
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#ifdef TENSORFLOW_USE_SYCL
template <typename T>

View File

@ -16,7 +16,7 @@ limitations under the License.
#ifndef TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_GPU_H_
#define TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_GPU_H_
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
@ -35,7 +35,7 @@ __global__ void TileKernel(int nthreads, const T* src, const int32* buf,
const int32* in_strides = buf;
const int32* out_strides = buf + ndims;
const int32* in_dim_sizes = buf + ndims * 2;
CUDA_1D_KERNEL_LOOP(o_idx, nthreads) {
GPU_1D_KERNEL_LOOP(o_idx, nthreads) {
int32 i_idx = 0;
int32 t = o_idx;
for (int i = 0; i < ndims; ++i) {
@ -67,17 +67,17 @@ void TileSimple(const Eigen::GpuDevice& d, Tensor* out, const Tensor& in) {
// device.
auto num_bytes = sizeof(int64) * host_buf.size();
auto dev_buf = d.allocate(num_bytes);
// NOTE: host_buf is not allocated by CudaHostAllocator, and
// NOTE: host_buf is not allocated by GpuHostAllocator, and
// therefore we are doing a sync copy effectively.
d.memcpyHostToDevice(dev_buf, host_buf.data(), num_bytes);
// Launch kernel to q[...] = p[...].
const T* p = in.flat<T>().data();
T* q = out->flat<T>().data();
GpuLaunchConfig cfg = GetCudaLaunchConfig(out_nelem, d);
GpuLaunchConfig cfg = GetGpuLaunchConfig(out_nelem, d);
TF_CHECK_OK(
CudaLaunchKernel(TileKernel<T>, cfg.block_count, cfg.thread_per_block, 0,
d.stream(), cfg.virtual_thread_count, p,
reinterpret_cast<const int32*>(dev_buf), ndims, q));
GpuLaunchKernel(TileKernel<T>, cfg.block_count, cfg.thread_per_block, 0,
d.stream(), cfg.virtual_thread_count, p,
reinterpret_cast<const int32*>(dev_buf), ndims, q));
// Safe to deallocate immediately after the kernel launch.
d.deallocate(dev_buf);
}
@ -85,6 +85,6 @@ void TileSimple(const Eigen::GpuDevice& d, Tensor* out, const Tensor& in) {
} // end namespace internal
} // namespace tensorflow
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#endif // TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_GPU_H_

View File

@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#include "tensorflow/core/kernels/tile_functor.h"
@ -28,4 +28,4 @@ template struct Tile<GpuDevice, bool, int64>;
} // namespace functor
} // namespace tensorflow
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#include "tensorflow/core/kernels/tile_functor.h"
@ -28,4 +28,4 @@ template struct Tile<GpuDevice, complex128, int64>;
} // namespace functor
} // namespace tensorflow
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#include "tensorflow/core/kernels/tile_functor.h"
@ -28,4 +28,4 @@ template struct Tile<GpuDevice, complex64, int64>;
} // namespace functor
} // namespace tensorflow
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#include "tensorflow/core/kernels/tile_functor.h"
@ -28,4 +28,4 @@ template struct Tile<GpuDevice, double, int64>;
} // namespace functor
} // namespace tensorflow
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#include "tensorflow/core/kernels/tile_functor.h"
@ -28,4 +28,4 @@ template struct Tile<GpuDevice, float, int64>;
} // namespace functor
} // namespace tensorflow
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#include "tensorflow/core/kernels/tile_functor.h"
@ -28,4 +28,4 @@ template struct Tile<GpuDevice, Eigen::half, int64>;
} // namespace functor
} // namespace tensorflow
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#include "tensorflow/core/kernels/tile_functor.h"
@ -28,4 +28,4 @@ template struct Tile<GpuDevice, int16, int64>;
} // namespace functor
} // namespace tensorflow
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#include "tensorflow/core/kernels/tile_functor.h"
@ -28,4 +28,4 @@ template struct Tile<GpuDevice, int32, int64>;
} // namespace functor
} // namespace tensorflow
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#include "tensorflow/core/kernels/tile_functor.h"
@ -28,4 +28,4 @@ template struct Tile<GpuDevice, int64, int64>;
} // namespace functor
} // namespace tensorflow
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@ -17,9 +17,9 @@ limitations under the License.
#define EIGEN_USE_THREADS
#ifdef GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include <vector>
@ -82,7 +82,7 @@ struct ReduceAndReshape {
// Explicit instantiations are defined in tile_ops_{cpu,gpu}_impl.*,
// below are their declarations.
#ifdef GOOGLE_CUDA
#ifdef GOOGLE_CUDA || TENSORFLOW_USE_ROCM
extern template struct Tile<GPUDevice, bool, int32>;
extern template struct Tile<GPUDevice, bool, int64>;
extern template struct Tile<GPUDevice, float, int32>;
@ -104,9 +104,9 @@ extern template struct Tile<GPUDevice, int64, int64>;
#define DECLARE_CUDA_DIM(T, NDIM) \
extern template struct TileGrad<GPUDevice, T, NDIM>; \
extern template struct ReduceAndReshape<GPUDevice, T, NDIM, 1>
#else // GOOGLE_CUDA
#else // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define DECLARE_CUDA_DIM(T, NDIM)
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#ifdef TENSORFLOW_USE_SYCL
#define DECLARE_TYPE(T) \
@ -324,7 +324,7 @@ TF_CALL_complex64(HANDLE_TYPE_NAME_CPU);
TF_CALL_complex128(HANDLE_TYPE_NAME_CPU);
TF_CALL_string(HANDLE_TYPE_NAME_CPU);
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
TF_CALL_bool(HANDLE_TYPE_NAME_GPU);
TF_CALL_float(HANDLE_TYPE_NAME_GPU);
TF_CALL_double(HANDLE_TYPE_NAME_GPU);
@ -334,7 +334,7 @@ TF_CALL_int64(HANDLE_TYPE_NAME_GPU);
TF_CALL_half(HANDLE_TYPE_NAME_GPU);
TF_CALL_complex64(HANDLE_TYPE_NAME_GPU);
TF_CALL_complex128(HANDLE_TYPE_NAME_GPU);
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#ifdef TENSORFLOW_USE_SYCL
TF_CALL_float(HANDLE_TYPE_NAME_SYCL);
@ -590,7 +590,7 @@ TF_CALL_half(HANDLE_TYPE_NAME_CPU);
TF_CALL_complex64(HANDLE_TYPE_NAME_CPU);
TF_CALL_complex128(HANDLE_TYPE_NAME_CPU);
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
TF_CALL_float(HANDLE_TYPE_NAME_GPU);
TF_CALL_double(HANDLE_TYPE_NAME_GPU);
TF_CALL_int16(HANDLE_TYPE_NAME_GPU);
@ -599,7 +599,7 @@ TF_CALL_int64(HANDLE_TYPE_NAME_GPU);
TF_CALL_half(HANDLE_TYPE_NAME_GPU);
TF_CALL_complex64(HANDLE_TYPE_NAME_GPU);
TF_CALL_complex128(HANDLE_TYPE_NAME_GPU);
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#if TENSORFLOW_USE_SYCL
#define HANDLE_TYPE_NAME_SYCL(T) \
@ -639,7 +639,7 @@ REGISTER_KERNEL_BUILDER(Name("TileGrad")
.TypeConstraint<int64>("Tmultiples"),
TileGradientOp<CPUDevice, int64>);
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define REGISTER_GPU_TILE(type) \
REGISTER_KERNEL_BUILDER(Name("Tile") \
.Device(DEVICE_GPU) \
@ -685,7 +685,7 @@ TF_CALL_complex128(REGISTER_GPU)
#undef REGISTER_GPU_TILE
#undef REGISTER_GPU_TILE_GRAD
#undef REGISTER_GPU
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#ifdef TENSORFLOW_USE_SYCL
#define REGISTER_SYCL(type) \

View File

@ -19,7 +19,7 @@ limitations under the License.
// Header used to split up compilation of GPU tile ops. For each type you want
// to have tile ops, create a .cu.cc file containing
//
// #if GOOGLE_CUDA
// #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
// #include "tensorflow/core/kernels/tile_ops_gpu_impl.h"
// DEFINE_TILE_OPS(NDIM)
// #endif // GOGLE_CUDA
@ -28,8 +28,8 @@ limitations under the License.
//
// NOTE(keveman): Eigen's int8 and string versions don't compile yet with nvcc.
#ifndef GOOGLE_CUDA
#error "This header must be included inside #ifdef GOOGLE_CUDA"
#if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM
#error "This header must be included inside with Cuda or ROCm defined"
#endif
#define EIGEN_USE_GPU

View File

@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/tile_ops_gpu_impl.h"
DEFINE_TILE_OPS(1)
#endif // GOGLE_CUDA
#endif // GOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/tile_ops_gpu_impl.h"
DEFINE_TILE_OPS(2)
#endif // GOGLE_CUDA
#endif // GOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/tile_ops_gpu_impl.h"
DEFINE_TILE_OPS(3)
#endif // GOGLE_CUDA
#endif // GOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/tile_ops_gpu_impl.h"
DEFINE_TILE_OPS(4)
#endif // GOGLE_CUDA
#endif // GOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/tile_ops_gpu_impl.h"
DEFINE_TILE_OPS(5)
#endif // GOGLE_CUDA
#endif // GOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/tile_ops_gpu_impl.h"
DEFINE_TILE_OPS(6)
#endif // GOGLE_CUDA
#endif // GOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/tile_ops_gpu_impl.h"
DEFINE_TILE_OPS(7)
#endif // GOGLE_CUDA
#endif // GOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/tile_ops_gpu_impl.h"
DEFINE_TILE_OPS(8)
#endif // GOGLE_CUDA
#endif // GOGLE_CUDA || TENSORFLOW_USE_ROCM