Fixing and enabling TopK on ROCm

2020-01-22 12:16:47 -08:00 · 2020-01-22 12:16:47 -08:00 · d8ec97466c
commit d8ec97466c
parent 39e13608cb
14 changed files with 76 additions and 52 deletions
--- a/tensorflow/core/kernels/gpu_prim.h
+++ b/tensorflow/core/kernels/gpu_prim.h
@ -31,17 +31,27 @@ limitations under the license, the license you must see.
 #include "third_party/gpus/cuda/include/cusparse.h"

 namespace gpuprim = ::cub;
+
+// Required for sorting Eigen::half
+namespace cub {
+template <>
+struct NumericTraits<Eigen::half>
+    : BaseTraits<FLOATING_POINT, true, false, uint16_t, Eigen::half> {};
+}  // namespace cub
+
 #elif TENSORFLOW_USE_ROCM
 #include "rocm/include/hipcub/hipcub.hpp"
 namespace gpuprim = ::hipcub;

+// Required for sorting Eigen::half
 namespace rocprim {
 namespace detail {
 template <>
 struct radix_key_codec_base<Eigen::half>
-    : radix_key_codec_floating<Eigen::half, unsigned short> {};
+    : radix_key_codec_floating<Eigen::half, uint16_t> {};
 };  // namespace detail
 };  // namespace rocprim
-#endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_USE_ROCM

 #endif  // TENSORFLOW_CORE_KERNELS_GPU_PRIM_H_
--- a/tensorflow/core/kernels/in_topk_op_test.cc
+++ b/tensorflow/core/kernels/in_topk_op_test.cc
@ -76,9 +76,9 @@ static Graph* InTopK(int num_targets, int num_classes, T top_k) {
 BM_InTopK(int64, 64, 1000, 10, cpu);
 BM_InTopK(int64, 64, 10000, 10, cpu);

-#ifdef GOOGLE_CUDA
+#ifdef GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 BM_InTopK(int64, 64, 1000, 10, gpu);
 BM_InTopK(int64, 64, 10000, 10, gpu);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 }  // namespace tensorflow
--- a/tensorflow/core/kernels/topk_op.cc
+++ b/tensorflow/core/kernels/topk_op.cc
@ -244,7 +244,7 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS);
 #undef REGISTER_KERNELS_NAME
 #undef REGISTER_KERNELS

-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 namespace functor {
 #define DECLARE_GPU_SPEC(T)                                                  \
@ -277,6 +277,6 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS);
 TF_CALL_INTEGRAL_TYPES(REGISTER_KERNELS);
 #undef REGISTER_KERNELS

-#endif  // end GOOGLE_CUDA
+#endif  // end GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 }  // end namespace tensorflow
--- a/tensorflow/core/kernels/topk_op_gpu.h
+++ b/tensorflow/core/kernels/topk_op_gpu.h
@ -15,11 +15,12 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_TOPK_OP_GPU_H_
 #define TENSORFLOW_CORE_KERNELS_TOPK_OP_GPU_H_

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #define EIGEN_USE_GPU

 #include <cmath>
+#include <string>
 #include <vector>

 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@ -34,15 +35,6 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/gpu_kernel_helper.h"

-#if GOOGLE_CUDA
-// Required for sorting Eigen::half
-namespace cub {
-template <>
-struct NumericTraits<Eigen::half>
-    : BaseTraits<FLOATING_POINT, true, false, unsigned short, Eigen::half> {};
-}  // namespace cub
-#endif  // GOOGLE_CUDA
-
 namespace tensorflow {

 typedef Eigen::GpuDevice GPUDevice;
@ -93,7 +85,7 @@ struct IndirectLinearData {
  Entry* const backing_data;
 };

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 template <typename T>
 struct StridedData {
  typedef impl::Entry<T> Entry;
@ -115,6 +107,7 @@ template <HeapType heapType, PreferIndices preferIndices,
 struct IndexedHeap {
  typedef typename Data<T>::Entry Entry;
  const Data<T> data;
+  __device__ IndexedHeap(const Data<T>& d) : data(d) {}

  __device__ bool is_above(int left, int right) {
    T left_value = data.get_value(left);
@ -337,12 +330,21 @@ __device__ void mergeShards(int num_shards, int k,
  }
 }

+#if GOOGLE_CUDA
 extern __shared__ char shared_memory[];
+#endif

 template <typename T>
-__global__ void TopKKernel(const T* __restrict__ input, int length, int k,
-                           bool sorted, T* __restrict__ output,
-                           int* __restrict__ indices) {
+#if TENSORFLOW_USE_ROCM
+__attribute__((amdgpu_flat_work_group_size(1, 256))) 
+#endif
+__global__ void TopKKernel(
+    const T* __restrict__ input, int length, int k, bool sorted,
+    T* __restrict__ output, int* __restrict__ indices) {
+#if TENSORFLOW_USE_ROCM
+  HIP_DYNAMIC_SHARED(char, shared_memory);
+#endif
+
  const int batch_index = blockIdx.x;
  const T* batch_input = input + batch_index * length;

@ -370,7 +372,7 @@ __global__ void TopKKernel(const T* __restrict__ input, int length, int k,
 }

 template <typename T>
-cudaError LaunchTopKKernel(const cudaStream_t& stream, int num_shards,
+cudaError LaunchTopKKernel(const gpuStream_t& stream, int num_shards,
                           const T* input, int batch_size, int length, int k,
                           bool sorted, T* output, int* indices) {
  // This code assumes that k is small enough that the computation
@ -395,9 +397,17 @@ cudaError LaunchTopKKernel(const cudaStream_t& stream, int num_shards,
    }
    if (num_shards <= 0) {
      num_shards = 1;
+#if GOOGLE_CUDA
    } else if (num_shards > 1024) {
      num_shards = 1024;
    }
+#else
+      // ROCm can't execute with 1024 and requires an explicit
+      // amdgpu_flat_work_group_size attribute with >256
+    } else if (num_shards > 256) {
+      num_shards = 256;
+    }
+#endif
  }
  // We are limited by the amount of shared memory we have per block.
  auto shared_memory_size = (num_shards + 1) * k * sizeof(Entry<T>);
@ -448,9 +458,9 @@ Status LaunchSortKernel(OpKernelContext* ctx, const T* input, int num_rows,
  input_indices_t.device(d) =
      input_indices_t.generate(ColumnIndexCreator(num_cols));

-  cub::CountingInputIterator<int> counting_iter(0);
-  cub::TransformInputIterator<int, SegmentOffsetCreator,
-                              cub::CountingInputIterator<int>>
+  gpuprim::CountingInputIterator<int> counting_iter(0);
+  gpuprim::TransformInputIterator<int, SegmentOffsetCreator,
+                                  gpuprim::CountingInputIterator<int>>
      segment_offsets_t(counting_iter, SegmentOffsetCreator(num_cols));

  Tensor temp_values;
@ -472,7 +482,7 @@ Status LaunchSortKernel(OpKernelContext* ctx, const T* input, int num_rows,
    sorted_values_ptr = temp_values.flat<T>().data();
  }

-  auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
+  auto err = gpuprim::DeviceSegmentedRadixSort::SortPairsDescending(
      /* d_temp_storage */ nullptr,
      /* temp_storage_bytes */ temp_storage_bytes,
      /* d_keys_in */ input,
@ -489,7 +499,8 @@ Status LaunchSortKernel(OpKernelContext* ctx, const T* input, int num_rows,
  if (err != cudaSuccess) {
    return errors::Internal(
        "TopKOp: Could not launch "
-        "cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate "
+        "cub::gpuprim::DeviceSegmentedRadixSort::SortPairsDescending to "
+        "calculate "
        "temp_storage_bytes, status: ",
        cudaGetErrorString(err));
  }
@ -497,7 +508,7 @@ Status LaunchSortKernel(OpKernelContext* ctx, const T* input, int num_rows,
  TF_RETURN_IF_ERROR(ctx->allocate_temp(
      DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
      &temp_storage));
-  err = cub::DeviceSegmentedRadixSort::SortPairsDescending(
+  err = gpuprim::DeviceSegmentedRadixSort::SortPairsDescending(
      /* d_temp_storage */ temp_storage.flat<int8>().data(),
      /* temp_storage_bytes */ temp_storage_bytes,
      /* d_keys_in */ input,
@ -514,7 +525,8 @@ Status LaunchSortKernel(OpKernelContext* ctx, const T* input, int num_rows,
  if (err != cudaSuccess) {
    return errors::Internal(
        "TopKOp: Could not launch "
-        "cub::DeviceSegmentedRadixSort::SortPairsDescending to sort input, "
+        "cub::gpuprim::DeviceSegmentedRadixSort::SortPairsDescending to sort "
+        "input, "
        "temp_storage_bytes: ",
        temp_storage_bytes, ", status: ", cudaGetErrorString(err));
  }
@ -567,6 +579,6 @@ struct TopKFunctor<GPUDevice, T> {
 }  // end namespace functor
 }  // namespace tensorflow

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #endif  // TENSORFLOW_CORE_KERNELS_TOPK_OP_GPU_H_
--- a/tensorflow/core/kernels/topk_op_gpu_double.cu.cc
+++ b/tensorflow/core/kernels/topk_op_gpu_double.cu.cc
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU

 #include "tensorflow/core/kernels/topk_op.h"
@ -25,4 +25,4 @@ using Eigen::GpuDevice;
 template struct functor::TopKFunctor<GPUDevice, double>;
 }  // namespace tensorflow

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/kernels/topk_op_gpu_float.cu.cc
+++ b/tensorflow/core/kernels/topk_op_gpu_float.cu.cc
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU

 #include "tensorflow/core/kernels/topk_op.h"
@ -25,4 +25,4 @@ using Eigen::GpuDevice;
 template struct functor::TopKFunctor<GPUDevice, float>;
 }  // namespace tensorflow

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/kernels/topk_op_gpu_half.cu.cc
+++ b/tensorflow/core/kernels/topk_op_gpu_half.cu.cc
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU

 #include "tensorflow/core/kernels/topk_op.h"
@ -25,4 +25,4 @@ using Eigen::GpuDevice;
 template struct functor::TopKFunctor<GPUDevice, Eigen::half>;
 }  // namespace tensorflow

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/kernels/topk_op_gpu_int16.cu.cc
+++ b/tensorflow/core/kernels/topk_op_gpu_int16.cu.cc
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU

 #include "tensorflow/core/kernels/topk_op.h"
@ -25,4 +25,4 @@ using Eigen::GpuDevice;
 template struct functor::TopKFunctor<GPUDevice, int16>;
 }  // namespace tensorflow

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/kernels/topk_op_gpu_int32.cu.cc
+++ b/tensorflow/core/kernels/topk_op_gpu_int32.cu.cc
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU

 #include "tensorflow/core/kernels/topk_op.h"
@ -25,4 +25,4 @@ using Eigen::GpuDevice;
 template struct functor::TopKFunctor<GPUDevice, int32>;
 }  // namespace tensorflow

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/kernels/topk_op_gpu_int64.cu.cc
+++ b/tensorflow/core/kernels/topk_op_gpu_int64.cu.cc
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU

 #include "tensorflow/core/kernels/topk_op.h"
@ -25,4 +25,4 @@ using Eigen::GpuDevice;
 template struct functor::TopKFunctor<GPUDevice, int64>;
 }  // namespace tensorflow

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/kernels/topk_op_gpu_int8.cu.cc
+++ b/tensorflow/core/kernels/topk_op_gpu_int8.cu.cc
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU

 #include "tensorflow/core/kernels/topk_op.h"
@ -25,4 +25,4 @@ using Eigen::GpuDevice;
 template struct functor::TopKFunctor<GPUDevice, int8>;
 }  // namespace tensorflow

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/kernels/topk_op_gpu_uint16.cu.cc
+++ b/tensorflow/core/kernels/topk_op_gpu_uint16.cu.cc
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU

 #include "tensorflow/core/kernels/topk_op.h"
@ -27,4 +27,4 @@ template struct functor::TopKFunctor<GPUDevice, uint32>;

 }  // namespace tensorflow

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/core/kernels/topk_op_gpu_uint8.cu.cc
+++ b/tensorflow/core/kernels/topk_op_gpu_uint8.cu.cc
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU

 #include "tensorflow/core/kernels/topk_op.h"
@ -25,4 +25,4 @@ using Eigen::GpuDevice;
 template struct functor::TopKFunctor<GPUDevice, uint8>;
 }  // namespace tensorflow

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
--- a/tensorflow/python/kernel_tests/topk_op_test.py
+++ b/tensorflow/python/kernel_tests/topk_op_test.py
@ -102,11 +102,13 @@ class TopKTest(test.TestCase):
    self._validateTopK(inputs, 2, [[0.4, 0.3], [0.4, 0.3]], [[3, 1], [2, 1]])

  def testTop3(self):
-    k = 5
-    inputs = np.random.permutation(np.linspace(0, 100, 6140, dtype=np.float64))
-    indices = np.argsort(-inputs)[:k]
-    values = -np.sort(-inputs)[:k]
-    self._validateTopK(inputs, k, values, indices)
+    for k in range(3, 11, 2):
+      for dim in range(512, 12288, 512):
+        inputs = np.random.permutation(np.linspace(0, 100, dim, 
+                                                   dtype=np.float64))
+        indices = np.argsort(-inputs)[:k]
+        values = -np.sort(-inputs)[:k]
+        self._validateTopK(inputs, k, values, indices)

  def testTop1AllNan(self):
    inputs = [[np.NaN, np.NaN], [np.NaN, np.NaN]]