Adding ROCm support for tile ops
This commit is contained in:
parent
fff00129e1
commit
2925f2373a
tensorflow/core/kernels
tile_functor.htile_functor_gpu.htile_functor_gpu_bool.cu.cctile_functor_gpu_complex128.cu.cctile_functor_gpu_complex64.cu.cctile_functor_gpu_double.cu.cctile_functor_gpu_float.cu.cctile_functor_gpu_half.cu.cctile_functor_gpu_int16.cu.cctile_functor_gpu_int32.cu.cctile_functor_gpu_int64.cu.cctile_ops.cctile_ops_gpu_impl.htile_ops_gpu_impl_1.cu.cctile_ops_gpu_impl_2.cu.cctile_ops_gpu_impl_3.cu.cctile_ops_gpu_impl_4.cu.cctile_ops_gpu_impl_5.cu.cctile_ops_gpu_impl_6.cu.cctile_ops_gpu_impl_7.cu.cctile_ops_gpu_impl_8.cu.cc
@ -32,10 +32,10 @@ template <typename T>
|
||||
void TileSimple(const Eigen::ThreadPoolDevice& d, Tensor* out,
|
||||
const Tensor& in);
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
template <typename T>
|
||||
void TileSimple(const Eigen::GpuDevice& d, Tensor* out, const Tensor& in);
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
#ifdef TENSORFLOW_USE_SYCL
|
||||
template <typename T>
|
||||
|
@ -16,7 +16,7 @@ limitations under the License.
|
||||
#ifndef TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_GPU_H_
|
||||
#define TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_GPU_H_
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
@ -35,7 +35,7 @@ __global__ void TileKernel(int nthreads, const T* src, const int32* buf,
|
||||
const int32* in_strides = buf;
|
||||
const int32* out_strides = buf + ndims;
|
||||
const int32* in_dim_sizes = buf + ndims * 2;
|
||||
CUDA_1D_KERNEL_LOOP(o_idx, nthreads) {
|
||||
GPU_1D_KERNEL_LOOP(o_idx, nthreads) {
|
||||
int32 i_idx = 0;
|
||||
int32 t = o_idx;
|
||||
for (int i = 0; i < ndims; ++i) {
|
||||
@ -67,17 +67,17 @@ void TileSimple(const Eigen::GpuDevice& d, Tensor* out, const Tensor& in) {
|
||||
// device.
|
||||
auto num_bytes = sizeof(int64) * host_buf.size();
|
||||
auto dev_buf = d.allocate(num_bytes);
|
||||
// NOTE: host_buf is not allocated by CudaHostAllocator, and
|
||||
// NOTE: host_buf is not allocated by GpuHostAllocator, and
|
||||
// therefore we are doing a sync copy effectively.
|
||||
d.memcpyHostToDevice(dev_buf, host_buf.data(), num_bytes);
|
||||
// Launch kernel to q[...] = p[...].
|
||||
const T* p = in.flat<T>().data();
|
||||
T* q = out->flat<T>().data();
|
||||
GpuLaunchConfig cfg = GetCudaLaunchConfig(out_nelem, d);
|
||||
GpuLaunchConfig cfg = GetGpuLaunchConfig(out_nelem, d);
|
||||
TF_CHECK_OK(
|
||||
CudaLaunchKernel(TileKernel<T>, cfg.block_count, cfg.thread_per_block, 0,
|
||||
d.stream(), cfg.virtual_thread_count, p,
|
||||
reinterpret_cast<const int32*>(dev_buf), ndims, q));
|
||||
GpuLaunchKernel(TileKernel<T>, cfg.block_count, cfg.thread_per_block, 0,
|
||||
d.stream(), cfg.virtual_thread_count, p,
|
||||
reinterpret_cast<const int32*>(dev_buf), ndims, q));
|
||||
// Safe to deallocate immediately after the kernel launch.
|
||||
d.deallocate(dev_buf);
|
||||
}
|
||||
@ -85,6 +85,6 @@ void TileSimple(const Eigen::GpuDevice& d, Tensor* out, const Tensor& in) {
|
||||
} // end namespace internal
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
#endif // TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_GPU_H_
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "tensorflow/core/kernels/tile_functor.h"
|
||||
@ -28,4 +28,4 @@ template struct Tile<GpuDevice, bool, int64>;
|
||||
} // namespace functor
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "tensorflow/core/kernels/tile_functor.h"
|
||||
@ -28,4 +28,4 @@ template struct Tile<GpuDevice, complex128, int64>;
|
||||
} // namespace functor
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "tensorflow/core/kernels/tile_functor.h"
|
||||
@ -28,4 +28,4 @@ template struct Tile<GpuDevice, complex64, int64>;
|
||||
} // namespace functor
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "tensorflow/core/kernels/tile_functor.h"
|
||||
@ -28,4 +28,4 @@ template struct Tile<GpuDevice, double, int64>;
|
||||
} // namespace functor
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "tensorflow/core/kernels/tile_functor.h"
|
||||
@ -28,4 +28,4 @@ template struct Tile<GpuDevice, float, int64>;
|
||||
} // namespace functor
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "tensorflow/core/kernels/tile_functor.h"
|
||||
@ -28,4 +28,4 @@ template struct Tile<GpuDevice, Eigen::half, int64>;
|
||||
} // namespace functor
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "tensorflow/core/kernels/tile_functor.h"
|
||||
@ -28,4 +28,4 @@ template struct Tile<GpuDevice, int16, int64>;
|
||||
} // namespace functor
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "tensorflow/core/kernels/tile_functor.h"
|
||||
@ -28,4 +28,4 @@ template struct Tile<GpuDevice, int32, int64>;
|
||||
} // namespace functor
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "tensorflow/core/kernels/tile_functor.h"
|
||||
@ -28,4 +28,4 @@ template struct Tile<GpuDevice, int64, int64>;
|
||||
} // namespace functor
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -17,9 +17,9 @@ limitations under the License.
|
||||
|
||||
#define EIGEN_USE_THREADS
|
||||
|
||||
#ifdef GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
#define EIGEN_USE_GPU
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
#include <vector>
|
||||
|
||||
@ -82,7 +82,7 @@ struct ReduceAndReshape {
|
||||
// Explicit instantiations are defined in tile_ops_{cpu,gpu}_impl.*,
|
||||
// below are their declarations.
|
||||
|
||||
#ifdef GOOGLE_CUDA
|
||||
#ifdef GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
extern template struct Tile<GPUDevice, bool, int32>;
|
||||
extern template struct Tile<GPUDevice, bool, int64>;
|
||||
extern template struct Tile<GPUDevice, float, int32>;
|
||||
@ -104,9 +104,9 @@ extern template struct Tile<GPUDevice, int64, int64>;
|
||||
#define DECLARE_CUDA_DIM(T, NDIM) \
|
||||
extern template struct TileGrad<GPUDevice, T, NDIM>; \
|
||||
extern template struct ReduceAndReshape<GPUDevice, T, NDIM, 1>
|
||||
#else // GOOGLE_CUDA
|
||||
#else // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
#define DECLARE_CUDA_DIM(T, NDIM)
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
#ifdef TENSORFLOW_USE_SYCL
|
||||
#define DECLARE_TYPE(T) \
|
||||
@ -324,7 +324,7 @@ TF_CALL_complex64(HANDLE_TYPE_NAME_CPU);
|
||||
TF_CALL_complex128(HANDLE_TYPE_NAME_CPU);
|
||||
TF_CALL_string(HANDLE_TYPE_NAME_CPU);
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
TF_CALL_bool(HANDLE_TYPE_NAME_GPU);
|
||||
TF_CALL_float(HANDLE_TYPE_NAME_GPU);
|
||||
TF_CALL_double(HANDLE_TYPE_NAME_GPU);
|
||||
@ -334,7 +334,7 @@ TF_CALL_int64(HANDLE_TYPE_NAME_GPU);
|
||||
TF_CALL_half(HANDLE_TYPE_NAME_GPU);
|
||||
TF_CALL_complex64(HANDLE_TYPE_NAME_GPU);
|
||||
TF_CALL_complex128(HANDLE_TYPE_NAME_GPU);
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
#ifdef TENSORFLOW_USE_SYCL
|
||||
TF_CALL_float(HANDLE_TYPE_NAME_SYCL);
|
||||
@ -590,7 +590,7 @@ TF_CALL_half(HANDLE_TYPE_NAME_CPU);
|
||||
TF_CALL_complex64(HANDLE_TYPE_NAME_CPU);
|
||||
TF_CALL_complex128(HANDLE_TYPE_NAME_CPU);
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
TF_CALL_float(HANDLE_TYPE_NAME_GPU);
|
||||
TF_CALL_double(HANDLE_TYPE_NAME_GPU);
|
||||
TF_CALL_int16(HANDLE_TYPE_NAME_GPU);
|
||||
@ -599,7 +599,7 @@ TF_CALL_int64(HANDLE_TYPE_NAME_GPU);
|
||||
TF_CALL_half(HANDLE_TYPE_NAME_GPU);
|
||||
TF_CALL_complex64(HANDLE_TYPE_NAME_GPU);
|
||||
TF_CALL_complex128(HANDLE_TYPE_NAME_GPU);
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
#if TENSORFLOW_USE_SYCL
|
||||
#define HANDLE_TYPE_NAME_SYCL(T) \
|
||||
@ -639,7 +639,7 @@ REGISTER_KERNEL_BUILDER(Name("TileGrad")
|
||||
.TypeConstraint<int64>("Tmultiples"),
|
||||
TileGradientOp<CPUDevice, int64>);
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
#define REGISTER_GPU_TILE(type) \
|
||||
REGISTER_KERNEL_BUILDER(Name("Tile") \
|
||||
.Device(DEVICE_GPU) \
|
||||
@ -685,7 +685,7 @@ TF_CALL_complex128(REGISTER_GPU)
|
||||
#undef REGISTER_GPU_TILE
|
||||
#undef REGISTER_GPU_TILE_GRAD
|
||||
#undef REGISTER_GPU
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
#ifdef TENSORFLOW_USE_SYCL
|
||||
#define REGISTER_SYCL(type) \
|
||||
|
@ -19,7 +19,7 @@ limitations under the License.
|
||||
// Header used to split up compilation of GPU tile ops. For each type you want
|
||||
// to have tile ops, create a .cu.cc file containing
|
||||
//
|
||||
// #if GOOGLE_CUDA
|
||||
// #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
// #include "tensorflow/core/kernels/tile_ops_gpu_impl.h"
|
||||
// DEFINE_TILE_OPS(NDIM)
|
||||
// #endif // GOGLE_CUDA
|
||||
@ -28,8 +28,8 @@ limitations under the License.
|
||||
//
|
||||
// NOTE(keveman): Eigen's int8 and string versions don't compile yet with nvcc.
|
||||
|
||||
#ifndef GOOGLE_CUDA
|
||||
#error "This header must be included inside #ifdef GOOGLE_CUDA"
|
||||
#if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM
|
||||
#error "This header must be included inside with Cuda or ROCm defined"
|
||||
#endif
|
||||
|
||||
#define EIGEN_USE_GPU
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
#include "tensorflow/core/kernels/tile_ops_gpu_impl.h"
|
||||
DEFINE_TILE_OPS(1)
|
||||
#endif // GOGLE_CUDA
|
||||
#endif // GOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
#include "tensorflow/core/kernels/tile_ops_gpu_impl.h"
|
||||
DEFINE_TILE_OPS(2)
|
||||
#endif // GOGLE_CUDA
|
||||
#endif // GOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
#include "tensorflow/core/kernels/tile_ops_gpu_impl.h"
|
||||
DEFINE_TILE_OPS(3)
|
||||
#endif // GOGLE_CUDA
|
||||
#endif // GOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
#include "tensorflow/core/kernels/tile_ops_gpu_impl.h"
|
||||
DEFINE_TILE_OPS(4)
|
||||
#endif // GOGLE_CUDA
|
||||
#endif // GOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
#include "tensorflow/core/kernels/tile_ops_gpu_impl.h"
|
||||
DEFINE_TILE_OPS(5)
|
||||
#endif // GOGLE_CUDA
|
||||
#endif // GOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
#include "tensorflow/core/kernels/tile_ops_gpu_impl.h"
|
||||
DEFINE_TILE_OPS(6)
|
||||
#endif // GOGLE_CUDA
|
||||
#endif // GOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
#include "tensorflow/core/kernels/tile_ops_gpu_impl.h"
|
||||
DEFINE_TILE_OPS(7)
|
||||
#endif // GOGLE_CUDA
|
||||
#endif // GOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
#include "tensorflow/core/kernels/tile_ops_gpu_impl.h"
|
||||
DEFINE_TILE_OPS(8)
|
||||
#endif // GOGLE_CUDA
|
||||
#endif // GOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
Loading…
Reference in New Issue
Block a user