Fixing and enabling TopK on ROCm
This commit is contained in:
parent
39e13608cb
commit
d8ec97466c
@ -31,17 +31,27 @@ limitations under the license, the license you must see.
|
||||
#include "third_party/gpus/cuda/include/cusparse.h"
|
||||
|
||||
namespace gpuprim = ::cub;
|
||||
|
||||
// Required for sorting Eigen::half
|
||||
namespace cub {
|
||||
template <>
|
||||
struct NumericTraits<Eigen::half>
|
||||
: BaseTraits<FLOATING_POINT, true, false, uint16_t, Eigen::half> {};
|
||||
} // namespace cub
|
||||
|
||||
#elif TENSORFLOW_USE_ROCM
|
||||
#include "rocm/include/hipcub/hipcub.hpp"
|
||||
namespace gpuprim = ::hipcub;
|
||||
|
||||
// Required for sorting Eigen::half
|
||||
namespace rocprim {
|
||||
namespace detail {
|
||||
template <>
|
||||
struct radix_key_codec_base<Eigen::half>
|
||||
: radix_key_codec_floating<Eigen::half, unsigned short> {};
|
||||
: radix_key_codec_floating<Eigen::half, uint16_t> {};
|
||||
}; // namespace detail
|
||||
}; // namespace rocprim
|
||||
#endif // GOOGLE_CUDA
|
||||
|
||||
#endif // TENSORFLOW_USE_ROCM
|
||||
|
||||
#endif // TENSORFLOW_CORE_KERNELS_GPU_PRIM_H_
|
||||
|
@ -76,9 +76,9 @@ static Graph* InTopK(int num_targets, int num_classes, T top_k) {
|
||||
BM_InTopK(int64, 64, 1000, 10, cpu);
|
||||
BM_InTopK(int64, 64, 10000, 10, cpu);
|
||||
|
||||
#ifdef GOOGLE_CUDA
|
||||
#ifdef GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
BM_InTopK(int64, 64, 1000, 10, gpu);
|
||||
BM_InTopK(int64, 64, 10000, 10, gpu);
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
} // namespace tensorflow
|
||||
|
@ -244,7 +244,7 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS);
|
||||
#undef REGISTER_KERNELS_NAME
|
||||
#undef REGISTER_KERNELS
|
||||
|
||||
#ifdef GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
namespace functor {
|
||||
#define DECLARE_GPU_SPEC(T) \
|
||||
@ -277,6 +277,6 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS);
|
||||
TF_CALL_INTEGRAL_TYPES(REGISTER_KERNELS);
|
||||
#undef REGISTER_KERNELS
|
||||
|
||||
#endif // end GOOGLE_CUDA
|
||||
#endif // end GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
} // end namespace tensorflow
|
||||
|
@ -15,11 +15,12 @@ limitations under the License.
|
||||
#ifndef TENSORFLOW_CORE_KERNELS_TOPK_OP_GPU_H_
|
||||
#define TENSORFLOW_CORE_KERNELS_TOPK_OP_GPU_H_
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include <cmath>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
|
||||
@ -34,15 +35,6 @@ limitations under the License.
|
||||
#include "tensorflow/core/platform/types.h"
|
||||
#include "tensorflow/core/util/gpu_kernel_helper.h"
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
// Required for sorting Eigen::half
|
||||
namespace cub {
|
||||
template <>
|
||||
struct NumericTraits<Eigen::half>
|
||||
: BaseTraits<FLOATING_POINT, true, false, unsigned short, Eigen::half> {};
|
||||
} // namespace cub
|
||||
#endif // GOOGLE_CUDA
|
||||
|
||||
namespace tensorflow {
|
||||
|
||||
typedef Eigen::GpuDevice GPUDevice;
|
||||
@ -93,7 +85,7 @@ struct IndirectLinearData {
|
||||
Entry* const backing_data;
|
||||
};
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
template <typename T>
|
||||
struct StridedData {
|
||||
typedef impl::Entry<T> Entry;
|
||||
@ -115,6 +107,7 @@ template <HeapType heapType, PreferIndices preferIndices,
|
||||
struct IndexedHeap {
|
||||
typedef typename Data<T>::Entry Entry;
|
||||
const Data<T> data;
|
||||
__device__ IndexedHeap(const Data<T>& d) : data(d) {}
|
||||
|
||||
__device__ bool is_above(int left, int right) {
|
||||
T left_value = data.get_value(left);
|
||||
@ -337,12 +330,21 @@ __device__ void mergeShards(int num_shards, int k,
|
||||
}
|
||||
}
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
extern __shared__ char shared_memory[];
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
__global__ void TopKKernel(const T* __restrict__ input, int length, int k,
|
||||
bool sorted, T* __restrict__ output,
|
||||
int* __restrict__ indices) {
|
||||
#if TENSORFLOW_USE_ROCM
|
||||
__attribute__((amdgpu_flat_work_group_size(1, 256)))
|
||||
#endif
|
||||
__global__ void TopKKernel(
|
||||
const T* __restrict__ input, int length, int k, bool sorted,
|
||||
T* __restrict__ output, int* __restrict__ indices) {
|
||||
#if TENSORFLOW_USE_ROCM
|
||||
HIP_DYNAMIC_SHARED(char, shared_memory);
|
||||
#endif
|
||||
|
||||
const int batch_index = blockIdx.x;
|
||||
const T* batch_input = input + batch_index * length;
|
||||
|
||||
@ -370,7 +372,7 @@ __global__ void TopKKernel(const T* __restrict__ input, int length, int k,
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
cudaError LaunchTopKKernel(const cudaStream_t& stream, int num_shards,
|
||||
cudaError LaunchTopKKernel(const gpuStream_t& stream, int num_shards,
|
||||
const T* input, int batch_size, int length, int k,
|
||||
bool sorted, T* output, int* indices) {
|
||||
// This code assumes that k is small enough that the computation
|
||||
@ -395,9 +397,17 @@ cudaError LaunchTopKKernel(const cudaStream_t& stream, int num_shards,
|
||||
}
|
||||
if (num_shards <= 0) {
|
||||
num_shards = 1;
|
||||
#if GOOGLE_CUDA
|
||||
} else if (num_shards > 1024) {
|
||||
num_shards = 1024;
|
||||
}
|
||||
#else
|
||||
// ROCm can't execute with 1024 and requires an explicit
|
||||
// amdgpu_flat_work_group_size attribute with >256
|
||||
} else if (num_shards > 256) {
|
||||
num_shards = 256;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
// We are limited by the amount of shared memory we have per block.
|
||||
auto shared_memory_size = (num_shards + 1) * k * sizeof(Entry<T>);
|
||||
@ -448,9 +458,9 @@ Status LaunchSortKernel(OpKernelContext* ctx, const T* input, int num_rows,
|
||||
input_indices_t.device(d) =
|
||||
input_indices_t.generate(ColumnIndexCreator(num_cols));
|
||||
|
||||
cub::CountingInputIterator<int> counting_iter(0);
|
||||
cub::TransformInputIterator<int, SegmentOffsetCreator,
|
||||
cub::CountingInputIterator<int>>
|
||||
gpuprim::CountingInputIterator<int> counting_iter(0);
|
||||
gpuprim::TransformInputIterator<int, SegmentOffsetCreator,
|
||||
gpuprim::CountingInputIterator<int>>
|
||||
segment_offsets_t(counting_iter, SegmentOffsetCreator(num_cols));
|
||||
|
||||
Tensor temp_values;
|
||||
@ -472,7 +482,7 @@ Status LaunchSortKernel(OpKernelContext* ctx, const T* input, int num_rows,
|
||||
sorted_values_ptr = temp_values.flat<T>().data();
|
||||
}
|
||||
|
||||
auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
|
||||
auto err = gpuprim::DeviceSegmentedRadixSort::SortPairsDescending(
|
||||
/* d_temp_storage */ nullptr,
|
||||
/* temp_storage_bytes */ temp_storage_bytes,
|
||||
/* d_keys_in */ input,
|
||||
@ -489,7 +499,8 @@ Status LaunchSortKernel(OpKernelContext* ctx, const T* input, int num_rows,
|
||||
if (err != cudaSuccess) {
|
||||
return errors::Internal(
|
||||
"TopKOp: Could not launch "
|
||||
"cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate "
|
||||
"cub::gpuprim::DeviceSegmentedRadixSort::SortPairsDescending to "
|
||||
"calculate "
|
||||
"temp_storage_bytes, status: ",
|
||||
cudaGetErrorString(err));
|
||||
}
|
||||
@ -497,7 +508,7 @@ Status LaunchSortKernel(OpKernelContext* ctx, const T* input, int num_rows,
|
||||
TF_RETURN_IF_ERROR(ctx->allocate_temp(
|
||||
DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
|
||||
&temp_storage));
|
||||
err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
|
||||
err = gpuprim::DeviceSegmentedRadixSort::SortPairsDescending(
|
||||
/* d_temp_storage */ temp_storage.flat<int8>().data(),
|
||||
/* temp_storage_bytes */ temp_storage_bytes,
|
||||
/* d_keys_in */ input,
|
||||
@ -514,7 +525,8 @@ Status LaunchSortKernel(OpKernelContext* ctx, const T* input, int num_rows,
|
||||
if (err != cudaSuccess) {
|
||||
return errors::Internal(
|
||||
"TopKOp: Could not launch "
|
||||
"cub::DeviceSegmentedRadixSort::SortPairsDescending to sort input, "
|
||||
"cub::gpuprim::DeviceSegmentedRadixSort::SortPairsDescending to sort "
|
||||
"input, "
|
||||
"temp_storage_bytes: ",
|
||||
temp_storage_bytes, ", status: ", cudaGetErrorString(err));
|
||||
}
|
||||
@ -567,6 +579,6 @@ struct TopKFunctor<GPUDevice, T> {
|
||||
} // end namespace functor
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
#endif // TENSORFLOW_CORE_KERNELS_TOPK_OP_GPU_H_
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "tensorflow/core/kernels/topk_op.h"
|
||||
@ -25,4 +25,4 @@ using Eigen::GpuDevice;
|
||||
template struct functor::TopKFunctor<GPUDevice, double>;
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "tensorflow/core/kernels/topk_op.h"
|
||||
@ -25,4 +25,4 @@ using Eigen::GpuDevice;
|
||||
template struct functor::TopKFunctor<GPUDevice, float>;
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "tensorflow/core/kernels/topk_op.h"
|
||||
@ -25,4 +25,4 @@ using Eigen::GpuDevice;
|
||||
template struct functor::TopKFunctor<GPUDevice, Eigen::half>;
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "tensorflow/core/kernels/topk_op.h"
|
||||
@ -25,4 +25,4 @@ using Eigen::GpuDevice;
|
||||
template struct functor::TopKFunctor<GPUDevice, int16>;
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "tensorflow/core/kernels/topk_op.h"
|
||||
@ -25,4 +25,4 @@ using Eigen::GpuDevice;
|
||||
template struct functor::TopKFunctor<GPUDevice, int32>;
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "tensorflow/core/kernels/topk_op.h"
|
||||
@ -25,4 +25,4 @@ using Eigen::GpuDevice;
|
||||
template struct functor::TopKFunctor<GPUDevice, int64>;
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "tensorflow/core/kernels/topk_op.h"
|
||||
@ -25,4 +25,4 @@ using Eigen::GpuDevice;
|
||||
template struct functor::TopKFunctor<GPUDevice, int8>;
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "tensorflow/core/kernels/topk_op.h"
|
||||
@ -27,4 +27,4 @@ template struct functor::TopKFunctor<GPUDevice, uint32>;
|
||||
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "tensorflow/core/kernels/topk_op.h"
|
||||
@ -25,4 +25,4 @@ using Eigen::GpuDevice;
|
||||
template struct functor::TopKFunctor<GPUDevice, uint8>;
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -102,11 +102,13 @@ class TopKTest(test.TestCase):
|
||||
self._validateTopK(inputs, 2, [[0.4, 0.3], [0.4, 0.3]], [[3, 1], [2, 1]])
|
||||
|
||||
def testTop3(self):
|
||||
k = 5
|
||||
inputs = np.random.permutation(np.linspace(0, 100, 6140, dtype=np.float64))
|
||||
indices = np.argsort(-inputs)[:k]
|
||||
values = -np.sort(-inputs)[:k]
|
||||
self._validateTopK(inputs, k, values, indices)
|
||||
for k in range(3, 11, 2):
|
||||
for dim in range(512, 12288, 512):
|
||||
inputs = np.random.permutation(np.linspace(0, 100, dim,
|
||||
dtype=np.float64))
|
||||
indices = np.argsort(-inputs)[:k]
|
||||
values = -np.sort(-inputs)[:k]
|
||||
self._validateTopK(inputs, k, values, indices)
|
||||
|
||||
def testTop1AllNan(self):
|
||||
inputs = [[np.NaN, np.NaN], [np.NaN, np.NaN]]
|
||||
|
Loading…
Reference in New Issue
Block a user