Add ROCm support for launching 2D GPU convolutions
parent ef0b1eff8d
commit 82ccb9a50d
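The diff below mechanically widens the build guards from GOOGLE_CUDA to GOOGLE_CUDA || TENSORFLOW_USE_ROCM and swaps the CUDA-only launch helpers (CUDA_1D_KERNEL_LOOP, GetCudaLaunchConfig, CudaLaunchKernel) for their GPU-generic counterparts (GPU_1D_KERNEL_LOOP, GetGpuLaunchConfig, GpuLaunchKernel), so the same kernels build for both CUDA and ROCm. As a minimal sketch of the kind of dispatch such a wrapper performs: LaunchKernelSketch, AddOne, and gpuStream_t are names introduced here for illustration, not TensorFlow's actual implementation.

// Illustrative only: one call site that targets the CUDA triple-chevron
// launch or HIP's hipLaunchKernelGGL, mirroring the argument order used by
// GpuLaunchKernel in this diff.
#if TENSORFLOW_USE_ROCM
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;
#else
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;
#endif

template <typename Kernel, typename... Args>
void LaunchKernelSketch(Kernel kernel, int grid_dim, int block_dim,
                        size_t shared_bytes, gpuStream_t stream, Args... args) {
#if TENSORFLOW_USE_ROCM
  // ROCm path: HIP's C-style launch macro.
  hipLaunchKernelGGL(kernel, dim3(grid_dim), dim3(block_dim), shared_bytes,
                     stream, args...);
#else
  // CUDA path: the usual triple-chevron launch.
  kernel<<<grid_dim, block_dim, shared_bytes, stream>>>(args...);
#endif
}

__global__ void AddOne(int n, float* data) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] += 1.0f;
}

// Usage: LaunchKernelSketch(AddOne, 256, 256, 0, stream, n, d_data);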
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_CONV_2D_GPU_H_
 #define TENSORFLOW_CORE_KERNELS_CONV_2D_GPU_H_
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #define EIGEN_USE_GPU
 
@@ -25,7 +25,9 @@ limitations under the License.
 #include <limits>
 #include <utility>
 
+#if GOOGLE_CUDA
 #include "third_party/gpus/cuda/include/cuda.h"
+#endif
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/lib/math/math_util.h"
@@ -49,7 +51,7 @@ struct maybe_conj {
   }
 };
 
-// Partial specializations for Cuda types used to store complex numbers.
+// Partial specializations for Gpu types used to store complex numbers.
 template <bool conjugate>
 struct maybe_conj<float2, conjugate> {
   __device__ static __inline__ float2 run(float2 c) {
@@ -191,7 +193,7 @@ __global__ void ShuffleInTensor3Simple(int nthreads, const T* input,
   // performance. Iterating over output will generate sequential writes and
   // random reads that performs better compared to sequential reads and random
   // writes.
-  CUDA_1D_KERNEL_LOOP(output_index, nthreads) {
+  GPU_1D_KERNEL_LOOP(output_index, nthreads) {
     Index<3> output_tensor_index = FlatToTensorIndex(output_index, output_dims);
 
     Index<3> input_tensor_index;
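The hunk above swaps CUDA_1D_KERNEL_LOOP for GPU_1D_KERNEL_LOOP in the shuffle kernel. As a rough illustration (the real macro lives in TensorFlow's GPU kernel helper headers and may differ in detail), such a loop is a grid-stride loop over a flat index, which lets the kernel cover every output element regardless of how many blocks were actually launched:

// GRID_STRIDE_LOOP and CopySketch are names introduced here for illustration,
// not the TensorFlow macro or kernel.
#define GRID_STRIDE_LOOP(i, n)                                  \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)

template <typename T>
__global__ void CopySketch(int nthreads, const T* in, T* out) {
  GRID_STRIDE_LOOP(index, nthreads) {
    // Consecutive threads write consecutive output elements, keeping the
    // writes coalesced; this is the property the kernel comment relies on.
    out[index] = in[index];
  }
}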
@@ -232,11 +234,15 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
   // One extra line in the inner dimension to avoid share memory bank conflict.
   // This is to mimic the following, but no constructor of T can be invoked.
   // __shared__ T shared_memory_tile[TileSizeI][TileSizeJ + 1];
+#if GOOGLE_CUDA
   __shared__ __align__(
       alignof(T)) char shared_mem_raw[TileSizeI * (TileSizeJ + 1) * sizeof(T)];
   typedef T(*SharedMemoryTile)[TileSizeJ + 1];
   SharedMemoryTile shared_memory_tile =
       reinterpret_cast<SharedMemoryTile>(shared_mem_raw);
+#elif TENSORFLOW_USE_ROCM
+  __shared__ T shared_memory_tile[TileSizeI][TileSizeJ + 1];
+#endif
 
   int x = threadIdx.x;
 
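For context on the "+1" padding the comment describes: in a square shared-memory tile, the threads of a warp reading down a column would all hit the same bank, so the extra column skews successive rows into different banks. The CUDA branch builds the tile out of an aligned char buffer so that no constructor of T is invoked, while the ROCm branch declares the typed array directly. A stand-alone sketch of the padding trick, with names and a 32x32 tile chosen here for illustration rather than taken from the kernel above:

// kTile and TransposeTileSketch are illustrative. Launch with a 32x32 block
// and a grid of ceil(cols/32) x ceil(rows/32).
constexpr int kTile = 32;

template <typename T>
__global__ void TransposeTileSketch(const T* in, T* out, int rows, int cols) {
  // The "+ 1" column keeps threads that read a column of the tile from
  // hitting the same shared-memory bank.
  __shared__ T tile[kTile][kTile + 1];

  int x = blockIdx.x * kTile + threadIdx.x;
  int y = blockIdx.y * kTile + threadIdx.y;
  if (x < cols && y < rows) tile[threadIdx.y][threadIdx.x] = in[y * cols + x];
  __syncthreads();

  // Transposed coordinates: the block that covered (row band, col band) now
  // writes (col band, row band); the reads walk a column of `tile`, which the
  // padding spreads across distinct banks.
  int tx = blockIdx.y * kTile + threadIdx.x;
  int ty = blockIdx.x * kTile + threadIdx.y;
  if (tx < rows && ty < cols) out[ty * rows + tx] = tile[threadIdx.x][threadIdx.y];
}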
@@ -357,14 +363,14 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
   }
 }
 
-// A Cuda custom kernel that convert input to output, given proper padding on
+// A Gpu custom kernel that convert input to output, given proper padding on
 // the left and the top. The padded value is zero.
 template <typename T, int NDIMS>
 __global__ void PadInputCustomKernelNHWC(int nthreads, const T* input,
                                          Dimension<NDIMS> input_dims, T* output,
                                          Dimension<NDIMS> output_dims,
                                          Dimension<NDIMS - 2> padding_left) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+  GPU_1D_KERNEL_LOOP(index, nthreads) {
     int output_index = index;
     Index<NDIMS> output_tensor_index =
         FlatToTensorIndex(output_index, output_dims);
@@ -393,7 +399,7 @@ __global__ void PadInputCustomKernelNCHW(int nthreads, const T* input,
                                          Dimension<NDIMS> input_dims, T* output,
                                          Dimension<NDIMS> output_dims,
                                          Dimension<NDIMS - 2> padding_left) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+  GPU_1D_KERNEL_LOOP(index, nthreads) {
     int output_index = index;
     Index<NDIMS> output_tensor_index =
         FlatToTensorIndex(output_index, output_dims);
@@ -432,19 +438,19 @@ struct TransformFilter<GPUDevice, T, int, NDIMS> {
     }
     combined_dims[1] = in.dimension(NDIMS - 2);  // input filters
     combined_dims[2] = in.dimension(NDIMS - 1);  // output filters
-    GpuLaunchConfig config = GetCudaLaunchConfig(out.size(), d);
+    GpuLaunchConfig config = GetGpuLaunchConfig(out.size(), d);
 
     if (dst_filter_format == FORMAT_OIHW) {
-      TF_CHECK_OK(CudaLaunchKernel(ShuffleInTensor3Simple<T, 2, 1, 0>,
+      TF_CHECK_OK(GpuLaunchKernel(ShuffleInTensor3Simple<T, 2, 1, 0>,
                                    config.block_count, config.thread_per_block,
                                    0, d.stream(), config.virtual_thread_count,
                                    in.data(), combined_dims, out.data()));
 
     } else if (dst_filter_format == FORMAT_OHWI) {
-      TF_CHECK_OK(CudaLaunchKernel(ShuffleInTensor3Simple<T, 1, 2, 0>,
+      TF_CHECK_OK(GpuLaunchKernel(ShuffleInTensor3Simple<T, 1, 2, 0>,
                                    config.block_count, config.thread_per_block,
                                    0, d.stream(), config.virtual_thread_count,
                                    in.data(), combined_dims, out.data()));
 
     } else {
       LOG(ERROR) << "Unsupported filter format: "
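GetCudaLaunchConfig becomes GetGpuLaunchConfig here; both produce a GpuLaunchConfig whose block_count, thread_per_block, and virtual_thread_count feed the launch above and the grid-stride loop inside the kernel. A rough sketch of what such a helper computes follows; the field names mirror the diff, but the sizing heuristic is an assumption, not TensorFlow's actual policy.

// LaunchConfigSketch and GetLaunchConfigSketch are illustrative names.
struct LaunchConfigSketch {
  int virtual_thread_count;  // total logical work items
  int thread_per_block;      // threads launched per block
  int block_count;           // blocks launched
};

inline LaunchConfigSketch GetLaunchConfigSketch(int work_element_count,
                                                int max_blocks = 4096) {
  LaunchConfigSketch config;
  config.virtual_thread_count = work_element_count;
  config.thread_per_block = 256;  // illustrative choice
  config.block_count =
      (work_element_count + config.thread_per_block - 1) /
      config.thread_per_block;
  // Cap the grid; the grid-stride loop covers any remaining elements.
  if (config.block_count > max_blocks) config.block_count = max_blocks;
  return config;
}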
@@ -471,11 +477,11 @@ struct ReverseTransformFilter<GPUDevice, T, NDIMS> {
         combined_dims[2] *= in.dimension(i);
       }
 
-      GpuLaunchConfig config = GetCudaLaunchConfig(out.size(), d);
-      TF_CHECK_OK(CudaLaunchKernel(ShuffleInTensor3Simple<T, 2, 1, 0>,
+      GpuLaunchConfig config = GetGpuLaunchConfig(out.size(), d);
+      TF_CHECK_OK(GpuLaunchKernel(ShuffleInTensor3Simple<T, 2, 1, 0>,
                                    config.block_count, config.thread_per_block,
                                    0, d.stream(), config.virtual_thread_count,
                                    in.data(), combined_dims, out.data()));
 
     } else if (src_filter_format == FORMAT_OHWI) {
       combined_dims[0] = in.dimension(0);  // output filters
@@ -485,11 +491,11 @@ struct ReverseTransformFilter<GPUDevice, T, NDIMS> {
       }
       combined_dims[2] = in.dimension(NDIMS - 1);  // input filters
 
-      GpuLaunchConfig config = GetCudaLaunchConfig(out.size(), d);
-      TF_CHECK_OK(CudaLaunchKernel(ShuffleInTensor3Simple<T, 2, 0, 1>,
+      GpuLaunchConfig config = GetGpuLaunchConfig(out.size(), d);
+      TF_CHECK_OK(GpuLaunchKernel(ShuffleInTensor3Simple<T, 2, 0, 1>,
                                    config.block_count, config.thread_per_block,
                                    0, d.stream(), config.virtual_thread_count,
                                    in.data(), combined_dims, out.data()));
 
     } else {
       // TODO(ezhulenev): Set error status in OpKernelContext instead.
@@ -510,7 +516,7 @@ struct PadInput<GPUDevice, T, int, NDIMS> {
                   const std::array<int, NDIMS - 2>& padding_right,
                   typename TTypes<T, NDIMS, int>::Tensor out,
                   TensorFormat format) {
-    GpuLaunchConfig config = GetCudaLaunchConfig(out.size(), d);
+    GpuLaunchConfig config = GetGpuLaunchConfig(out.size(), d);
     Dimension<NDIMS> input_dims;
     for (int i = 0; i < NDIMS; ++i) {
       input_dims[i] = in.dimension(i);
@@ -523,12 +529,12 @@ struct PadInput<GPUDevice, T, int, NDIMS> {
     const Dimension<NDIMS - 2> padding_left_dim(padding_left);
 
     if (format == FORMAT_NHWC) {
-      TF_CHECK_OK(CudaLaunchKernel(
+      TF_CHECK_OK(GpuLaunchKernel(
           PadInputCustomKernelNHWC<T, NDIMS>, config.block_count,
           config.thread_per_block, 0, d.stream(), config.virtual_thread_count,
           in.data(), input_dims, out.data(), output_dims, padding_left_dim));
     } else if (format == FORMAT_NCHW) {
-      TF_CHECK_OK(CudaLaunchKernel(
+      TF_CHECK_OK(GpuLaunchKernel(
           PadInputCustomKernelNCHW<T, NDIMS>, config.block_count,
           config.thread_per_block, 0, d.stream(), config.virtual_thread_count,
           in.data(), input_dims, out.data(), output_dims, padding_left_dim));
@@ -640,13 +646,13 @@ void LaunchBatchNarrowMatrixTransposeKernel(
     const T* input, const Dimension<3>& input_dims, T* output) {
   constexpr int NumThreads = TileLongSide;
   if (tile_size_i <= TileLongSide && tile_size_j <= TileShortSide) {
-    TF_CHECK_OK(CudaLaunchKernel(
+    TF_CHECK_OK(GpuLaunchKernel(
         SwapDimension1And2InTensor3UsingTiles<T, NumThreads, TileLongSide,
                                               TileShortSide>,
         total_tiles_count, NumThreads, 0, d.stream(), input, input_dims,
         output));
   } else {
-    TF_CHECK_OK(CudaLaunchKernel(
+    TF_CHECK_OK(GpuLaunchKernel(
        SwapDimension1And2InTensor3UsingTiles<T, NumThreads, TileShortSide,
                                              TileLongSide>,
        total_tiles_count, NumThreads, 0, d.stream(), input, input_dims,
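The launcher above picks between two instantiations of the same tiled kernel: the tile dimensions are template parameters, so only their orientation can change at runtime. A trimmed-down sketch of that dispatch shape, with the kernel body elided and every name here illustrative:

#include <cuda_runtime.h>

// TileKernelSketch, kLong, kShort, and DispatchTileSketch are illustrative;
// the real kernel is SwapDimension1And2InTensor3UsingTiles above.
template <int TileLong, int TileShort>
__global__ void TileKernelSketch(const float* in, float* out) {
  // A TileLong x TileShort shared-memory transpose body would go here.
}

void DispatchTileSketch(int tile_size_i, int tile_size_j, int total_tiles,
                        cudaStream_t stream, const float* in, float* out) {
  constexpr int kLong = 32;
  constexpr int kShort = 4;
  constexpr int kThreads = kLong;
  // Choose the instantiation whose compile-time tile shape fits the request.
  if (tile_size_i <= kLong && tile_size_j <= kShort) {
    TileKernelSketch<kLong, kShort><<<total_tiles, kThreads, 0, stream>>>(in, out);
  } else {
    TileKernelSketch<kShort, kLong><<<total_tiles, kThreads, 0, stream>>>(in, out);
  }
}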
@@ -951,8 +957,7 @@ void RunSwapDimension1And2InTensor3(const GPUDevice& d, const T* input,
 
     int total_tiles_count = input_dims_in_tiles[0] * input_dims_in_tiles[1] *
                             input_dims_in_tiles[2];
-
-    TF_CHECK_OK(CudaLaunchKernel(
+    TF_CHECK_OK(GpuLaunchKernel(
         SwapDimension1And2InTensor3UsingTiles<T, kNumThreads, kTileSize,
                                               kTileSize, conjugate>,
         total_tiles_count, kNumThreads, 0, d.stream(), input, input_dims,
@@ -963,11 +968,11 @@ void RunSwapDimension1And2InTensor3(const GPUDevice& d, const T* input,
         d, input, input_dims, output, kMinDimensionToUseTiles);
   } else {
     int total_element_count = input_dims[0] * input_dims[1] * input_dims[2];
-    GpuLaunchConfig config = GetCudaLaunchConfig(total_element_count, d);
-    TF_CHECK_OK(CudaLaunchKernel(ShuffleInTensor3Simple<T, 0, 2, 1, conjugate>,
+    GpuLaunchConfig config = GetGpuLaunchConfig(total_element_count, d);
+    TF_CHECK_OK(GpuLaunchKernel(ShuffleInTensor3Simple<T, 0, 2, 1, conjugate>,
                                  config.block_count, config.thread_per_block, 0,
                                  d.stream(), config.virtual_thread_count, input,
                                  input_dims, output));
   }
 }
 
@@ -996,11 +1001,11 @@ struct SwapDimension0And2InTensor3<GPUDevice, T, conjugate> {
                               static_cast<int>(combined_dims[1]),
                               static_cast<int>(combined_dims[2])};
     size_t total_size = combined_dims[0] * combined_dims[1] * combined_dims[2];
-    GpuLaunchConfig config = GetCudaLaunchConfig(total_size, d);
-    TF_CHECK_OK(CudaLaunchKernel(ShuffleInTensor3Simple<T, 2, 1, 0, conjugate>,
+    GpuLaunchConfig config = GetGpuLaunchConfig(total_size, d);
+    TF_CHECK_OK(GpuLaunchKernel(ShuffleInTensor3Simple<T, 2, 1, 0, conjugate>,
                                  config.block_count, config.thread_per_block, 0,
                                  d.stream(), config.virtual_thread_count, in,
                                  input_dims, out));
   }
 };
 
@@ -1043,6 +1048,6 @@ struct NCHWToNHWC<GPUDevice, T, NDIMS> {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #endif  // TENSORFLOW_CORE_KERNELS_CONV_2D_GPU_H_
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #define EIGEN_USE_GPU
 
@@ -47,4 +47,4 @@ template struct PadInput<Eigen::GpuDevice, double, int, 5>;
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
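The remaining hunks touch the per-type GPU translation units that explicitly instantiate the functors declared in the header, one element type per file, as the "template struct PadInput<...>" and "template struct SwapDimension0And2InTensor3<...>" context lines show; each file only needs its build guard widened. Roughly, such a translation unit has the shape sketched below. The header path is inferred from its include guard, and the full set of instantiations per file is not visible in this diff.

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define EIGEN_USE_GPU
#include "tensorflow/core/kernels/conv_2d_gpu.h"

namespace tensorflow {
namespace functor {
// One explicit instantiation per element type keeps each translation unit
// small; sibling files do the same for float, Eigen::half, uint8, uint16,
// uint32, and uint64.
template struct PadInput<Eigen::GpuDevice, double, int, 5>;
}  // namespace functor
}  // namespace tensorflow

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM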
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #define EIGEN_USE_GPU
 
@@ -60,4 +60,4 @@ template struct PadInput<Eigen::GpuDevice, float, int, 5>;
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #define EIGEN_USE_GPU
 
@@ -54,4 +54,4 @@ template struct PadInput<Eigen::GpuDevice, Eigen::half, int, 5>;
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #define EIGEN_USE_GPU
 
@@ -35,4 +35,4 @@ template struct SwapDimension0And2InTensor3<Eigen::GpuDevice, uint16>;
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #define EIGEN_USE_GPU
 
@@ -35,4 +35,4 @@ template struct SwapDimension0And2InTensor3<Eigen::GpuDevice, uint32>;
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #define EIGEN_USE_GPU
 
@@ -35,4 +35,4 @@ template struct SwapDimension0And2InTensor3<Eigen::GpuDevice, uint64>;
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #define EIGEN_USE_GPU
 
@@ -35,4 +35,4 @@ template struct SwapDimension0And2InTensor3<Eigen::GpuDevice, uint8>;
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM