Adding ROCm support for tile ops

2019-05-20 18:18:42 +00:00 · 2019-05-20 18:18:42 +00:00 · 2925f2373a
commit 2925f2373a
parent fff00129e1
21 changed files with 58 additions and 58 deletions
--- a/tensorflow/core/kernels/tile_functor.h
+++ b/tensorflow/core/kernels/tile_functor.h
@ -32,10 +32,10 @@ template <typename T>
 void TileSimple(const Eigen::ThreadPoolDevice& d, Tensor* out,
                const Tensor& in);

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 template <typename T>
 void TileSimple(const Eigen::GpuDevice& d, Tensor* out, const Tensor& in);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
--- a/tensorflow/core/kernels/tile_functor_gpu.h
+++ b/tensorflow/core/kernels/tile_functor_gpu.h
@ -16,7 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_GPU_H_
 #define TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_GPU_H_

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #define EIGEN_USE_GPU

@ -35,7 +35,7 @@ __global__ void TileKernel(int nthreads, const T* src, const int32* buf,
  const int32* in_strides = buf;
  const int32* out_strides = buf + ndims;
  const int32* in_dim_sizes = buf + ndims * 2;
-  CUDA_1D_KERNEL_LOOP(o_idx, nthreads) {
+  GPU_1D_KERNEL_LOOP(o_idx, nthreads) {
    int32 i_idx = 0;
    int32 t = o_idx;
    for (int i = 0; i < ndims; ++i) {
@ -67,17 +67,17 @@ void TileSimple(const Eigen::GpuDevice& d, Tensor* out, const Tensor& in) {
  // device.
  auto num_bytes = sizeof(int64) * host_buf.size();
  auto dev_buf = d.allocate(num_bytes);
-  // NOTE: host_buf is not allocated by CudaHostAllocator, and
+  // NOTE: host_buf is not allocated by GpuHostAllocator, and
  // therefore we are doing a sync copy effectively.
  d.memcpyHostToDevice(dev_buf, host_buf.data(), num_bytes);
  // Launch kernel to q[...] = p[...].
  const T* p = in.flat<T>().data();
  T* q = out->flat<T>().data();
-  GpuLaunchConfig cfg = GetCudaLaunchConfig(out_nelem, d);
+  GpuLaunchConfig cfg = GetGpuLaunchConfig(out_nelem, d);
  TF_CHECK_OK(
-      CudaLaunchKernel(TileKernel<T>, cfg.block_count, cfg.thread_per_block, 0,
-                       d.stream(), cfg.virtual_thread_count, p,
-                       reinterpret_cast<const int32*>(dev_buf), ndims, q));
+      GpuLaunchKernel(TileKernel<T>, cfg.block_count, cfg.thread_per_block, 0,
+                      d.stream(), cfg.virtual_thread_count, p,
+                      reinterpret_cast<const int32*>(dev_buf), ndims, q));
  // Safe to deallocate immediately after the kernel launch.
  d.deallocate(dev_buf);
 }
@ -85,6 +85,6 @@ void TileSimple(const Eigen::GpuDevice& d, Tensor* out, const Tensor& in) {
 }  // end namespace internal
 }  // namespace tensorflow

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #endif  // TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_GPU_H_
--- a/tensorflow/core/kernels/tile_functor_gpu_bool.cu.cc
+++ b/tensorflow/core/kernels/tile_functor_gpu_bool.cu.cc
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU

 #include "tensorflow/core/kernels/tile_functor.h"
@ -28,4 +28,4 @@ template struct Tile<GpuDevice, bool, int64>;
 }  // namespace functor
 }  // namespace tensorflow

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/kernels/tile_functor_gpu_complex128.cu.cc
+++ b/tensorflow/core/kernels/tile_functor_gpu_complex128.cu.cc
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU

 #include "tensorflow/core/kernels/tile_functor.h"
@ -28,4 +28,4 @@ template struct Tile<GpuDevice, complex128, int64>;
 }  // namespace functor
 }  // namespace tensorflow

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/kernels/tile_functor_gpu_complex64.cu.cc
+++ b/tensorflow/core/kernels/tile_functor_gpu_complex64.cu.cc
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU

 #include "tensorflow/core/kernels/tile_functor.h"
@ -28,4 +28,4 @@ template struct Tile<GpuDevice, complex64, int64>;
 }  // namespace functor
 }  // namespace tensorflow

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/kernels/tile_functor_gpu_double.cu.cc
+++ b/tensorflow/core/kernels/tile_functor_gpu_double.cu.cc
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU

 #include "tensorflow/core/kernels/tile_functor.h"
@ -28,4 +28,4 @@ template struct Tile<GpuDevice, double, int64>;
 }  // namespace functor
 }  // namespace tensorflow

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/kernels/tile_functor_gpu_float.cu.cc
+++ b/tensorflow/core/kernels/tile_functor_gpu_float.cu.cc
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU

 #include "tensorflow/core/kernels/tile_functor.h"
@ -28,4 +28,4 @@ template struct Tile<GpuDevice, float, int64>;
 }  // namespace functor
 }  // namespace tensorflow

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/kernels/tile_functor_gpu_half.cu.cc
+++ b/tensorflow/core/kernels/tile_functor_gpu_half.cu.cc
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU

 #include "tensorflow/core/kernels/tile_functor.h"
@ -28,4 +28,4 @@ template struct Tile<GpuDevice, Eigen::half, int64>;
 }  // namespace functor
 }  // namespace tensorflow

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/kernels/tile_functor_gpu_int16.cu.cc
+++ b/tensorflow/core/kernels/tile_functor_gpu_int16.cu.cc
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU

 #include "tensorflow/core/kernels/tile_functor.h"
@ -28,4 +28,4 @@ template struct Tile<GpuDevice, int16, int64>;
 }  // namespace functor
 }  // namespace tensorflow

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/kernels/tile_functor_gpu_int32.cu.cc
+++ b/tensorflow/core/kernels/tile_functor_gpu_int32.cu.cc
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU

 #include "tensorflow/core/kernels/tile_functor.h"
@ -28,4 +28,4 @@ template struct Tile<GpuDevice, int32, int64>;
 }  // namespace functor
 }  // namespace tensorflow

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/kernels/tile_functor_gpu_int64.cu.cc
+++ b/tensorflow/core/kernels/tile_functor_gpu_int64.cu.cc
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU

 #include "tensorflow/core/kernels/tile_functor.h"
@ -28,4 +28,4 @@ template struct Tile<GpuDevice, int64, int64>;
 }  // namespace functor
 }  // namespace tensorflow

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/kernels/tile_ops.cc
+++ b/tensorflow/core/kernels/tile_ops.cc
@ -17,9 +17,9 @@ limitations under the License.

 #define EIGEN_USE_THREADS

-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #include <vector>

@ -82,7 +82,7 @@ struct ReduceAndReshape {
 // Explicit instantiations are defined in tile_ops_{cpu,gpu}_impl.*,
 // below are their declarations.

-#ifdef GOOGLE_CUDA
+#ifdef GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 extern template struct Tile<GPUDevice, bool, int32>;
 extern template struct Tile<GPUDevice, bool, int64>;
 extern template struct Tile<GPUDevice, float, int32>;
@ -104,9 +104,9 @@ extern template struct Tile<GPUDevice, int64, int64>;
 #define DECLARE_CUDA_DIM(T, NDIM)                      \
  extern template struct TileGrad<GPUDevice, T, NDIM>; \
  extern template struct ReduceAndReshape<GPUDevice, T, NDIM, 1>
-#else  // GOOGLE_CUDA
+#else  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define DECLARE_CUDA_DIM(T, NDIM)
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #ifdef TENSORFLOW_USE_SYCL
 #define DECLARE_TYPE(T)                              \
@ -324,7 +324,7 @@ TF_CALL_complex64(HANDLE_TYPE_NAME_CPU);
 TF_CALL_complex128(HANDLE_TYPE_NAME_CPU);
 TF_CALL_string(HANDLE_TYPE_NAME_CPU);

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 TF_CALL_bool(HANDLE_TYPE_NAME_GPU);
 TF_CALL_float(HANDLE_TYPE_NAME_GPU);
 TF_CALL_double(HANDLE_TYPE_NAME_GPU);
@ -334,7 +334,7 @@ TF_CALL_int64(HANDLE_TYPE_NAME_GPU);
 TF_CALL_half(HANDLE_TYPE_NAME_GPU);
 TF_CALL_complex64(HANDLE_TYPE_NAME_GPU);
 TF_CALL_complex128(HANDLE_TYPE_NAME_GPU);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #ifdef TENSORFLOW_USE_SYCL
 TF_CALL_float(HANDLE_TYPE_NAME_SYCL);
@ -590,7 +590,7 @@ TF_CALL_half(HANDLE_TYPE_NAME_CPU);
 TF_CALL_complex64(HANDLE_TYPE_NAME_CPU);
 TF_CALL_complex128(HANDLE_TYPE_NAME_CPU);

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 TF_CALL_float(HANDLE_TYPE_NAME_GPU);
 TF_CALL_double(HANDLE_TYPE_NAME_GPU);
 TF_CALL_int16(HANDLE_TYPE_NAME_GPU);
@ -599,7 +599,7 @@ TF_CALL_int64(HANDLE_TYPE_NAME_GPU);
 TF_CALL_half(HANDLE_TYPE_NAME_GPU);
 TF_CALL_complex64(HANDLE_TYPE_NAME_GPU);
 TF_CALL_complex128(HANDLE_TYPE_NAME_GPU);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #if TENSORFLOW_USE_SYCL
 #define HANDLE_TYPE_NAME_SYCL(T) \
@ -639,7 +639,7 @@ REGISTER_KERNEL_BUILDER(Name("TileGrad")
                            .TypeConstraint<int64>("Tmultiples"),
                        TileGradientOp<CPUDevice, int64>);

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define REGISTER_GPU_TILE(type)                                    \
  REGISTER_KERNEL_BUILDER(Name("Tile")                             \
                              .Device(DEVICE_GPU)                  \
@ -685,7 +685,7 @@ TF_CALL_complex128(REGISTER_GPU)
 #undef REGISTER_GPU_TILE
 #undef REGISTER_GPU_TILE_GRAD
 #undef REGISTER_GPU
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #ifdef TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL(type)                                        \
--- a/tensorflow/core/kernels/tile_ops_gpu_impl.h
+++ b/tensorflow/core/kernels/tile_ops_gpu_impl.h
@ -19,7 +19,7 @@ limitations under the License.
 // Header used to split up compilation of GPU tile ops.  For each type you want
 // to have tile ops, create a .cu.cc file containing
 //
-//   #if GOOGLE_CUDA
+//   #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 //   #include "tensorflow/core/kernels/tile_ops_gpu_impl.h"
 //   DEFINE_TILE_OPS(NDIM)
 //   #endif  // GOGLE_CUDA
@ -28,8 +28,8 @@ limitations under the License.
 //
 // NOTE(keveman): Eigen's int8 and string versions don't compile yet with nvcc.

-#ifndef GOOGLE_CUDA
-#error "This header must be included inside #ifdef GOOGLE_CUDA"
+#if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM
+#error "This header must be included inside with Cuda or ROCm defined"
 #endif

 #define EIGEN_USE_GPU
--- a/tensorflow/core/kernels/tile_ops_gpu_impl_1.cu.cc
+++ b/tensorflow/core/kernels/tile_ops_gpu_impl_1.cu.cc
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #include "tensorflow/core/kernels/tile_ops_gpu_impl.h"
 DEFINE_TILE_OPS(1)
-#endif  // GOGLE_CUDA
+#endif  // GOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/kernels/tile_ops_gpu_impl_2.cu.cc
+++ b/tensorflow/core/kernels/tile_ops_gpu_impl_2.cu.cc
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #include "tensorflow/core/kernels/tile_ops_gpu_impl.h"
 DEFINE_TILE_OPS(2)
-#endif  // GOGLE_CUDA
+#endif  // GOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/kernels/tile_ops_gpu_impl_3.cu.cc
+++ b/tensorflow/core/kernels/tile_ops_gpu_impl_3.cu.cc
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #include "tensorflow/core/kernels/tile_ops_gpu_impl.h"
 DEFINE_TILE_OPS(3)
-#endif  // GOGLE_CUDA
+#endif  // GOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/kernels/tile_ops_gpu_impl_4.cu.cc
+++ b/tensorflow/core/kernels/tile_ops_gpu_impl_4.cu.cc
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #include "tensorflow/core/kernels/tile_ops_gpu_impl.h"
 DEFINE_TILE_OPS(4)
-#endif  // GOGLE_CUDA
+#endif  // GOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/kernels/tile_ops_gpu_impl_5.cu.cc
+++ b/tensorflow/core/kernels/tile_ops_gpu_impl_5.cu.cc
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #include "tensorflow/core/kernels/tile_ops_gpu_impl.h"
 DEFINE_TILE_OPS(5)
-#endif  // GOGLE_CUDA
+#endif  // GOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/kernels/tile_ops_gpu_impl_6.cu.cc
+++ b/tensorflow/core/kernels/tile_ops_gpu_impl_6.cu.cc
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #include "tensorflow/core/kernels/tile_ops_gpu_impl.h"
 DEFINE_TILE_OPS(6)
-#endif  // GOGLE_CUDA
+#endif  // GOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/kernels/tile_ops_gpu_impl_7.cu.cc
+++ b/tensorflow/core/kernels/tile_ops_gpu_impl_7.cu.cc
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #include "tensorflow/core/kernels/tile_ops_gpu_impl.h"
 DEFINE_TILE_OPS(7)
-#endif  // GOGLE_CUDA
+#endif  // GOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/kernels/tile_ops_gpu_impl_8.cu.cc
+++ b/tensorflow/core/kernels/tile_ops_gpu_impl_8.cu.cc
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #include "tensorflow/core/kernels/tile_ops_gpu_impl.h"
 DEFINE_TILE_OPS(8)
-#endif  // GOGLE_CUDA
+#endif  // GOGLE_CUDA || TENSORFLOW_USE_ROCM