Adding ROCm support for the multinomial op

2019-06-25 15:08:11 +00:00 · 2019-06-25 15:08:11 +00:00 · 02afe6c1eb
commit 02afe6c1eb
parent 2f4121b1a7
2 changed files with 22 additions and 15 deletions
--- a/tensorflow/core/kernels/multinomial_op.cc
+++ b/tensorflow/core/kernels/multinomial_op.cc
@@ -53,7 +53,7 @@ struct MultinomialFunctor {
                  typename TTypes<OutputType>::Matrix output);
 };

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 extern template struct MultinomialFunctor<GPUDevice, Eigen::half, int32>;
 extern template struct MultinomialFunctor<GPUDevice, float, int32>;
 extern template struct MultinomialFunctor<GPUDevice, double, int32>;
@@ -65,7 +65,7 @@ extern template struct MultinomialFunctor<GPUDevice, float, int64>;
 extern template struct MultinomialFunctor<GPUDevice, double, int64>;
 extern template struct MultinomialFunctor<GPUDevice, int32, int64>;
 extern template struct MultinomialFunctor<GPUDevice, int64, int64>;
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 template <typename T, typename OutputType>
 struct MultinomialFunctor<CPUDevice, T, OutputType> {
@@ -253,7 +253,7 @@ TF_CALL_float(REGISTER);
 TF_CALL_double(REGISTER);
 #undef REGISTER

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define REGISTER(TYPE)                                                   \
  REGISTER_KERNEL_BUILDER(Name("Multinomial")                            \
                              .Device(DEVICE_GPU)                        \
@@ -273,7 +273,7 @@ TF_CALL_float(REGISTER);
 TF_CALL_double(REGISTER);
 #undef REGISTER

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 template <typename Device, typename T, typename OutputType>
 class StatelessMultinomialOp : public MultinomialOp<Device, T, OutputType> {
@@ -321,7 +321,7 @@ TF_CALL_float(REGISTER);
 TF_CALL_double(REGISTER);
 #undef REGISTER

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define REGISTER(TYPE)                                                    \
  REGISTER_KERNEL_BUILDER(Name("StatelessMultinomial")                    \
                              .Device(DEVICE_GPU)                         \
@@ -343,7 +343,7 @@ TF_CALL_float(REGISTER);
 TF_CALL_double(REGISTER);
 #undef REGISTER

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 }  // end namespace

--- a/tensorflow/core/kernels/multinomial_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/multinomial_op_gpu.cu.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #define EIGEN_USE_GPU

@@ -29,6 +29,12 @@ limitations under the License.
 #include "tensorflow/core/lib/random/random_distributions.h"
 #include "tensorflow/core/util/gpu_kernel_helper.h"

+#if GOOGLE_CUDA
+namespace gpuprim = ::cub;
+#elif TENSORFLOW_USE_ROCM
+namespace gpuprim = ::hipcub;
+#endif
+
 namespace tensorflow {

 namespace functor {
@@ -41,12 +47,12 @@ template <typename OutputType>
 __global__ void MultinomialKernel(int32 nthreads, const int32 num_classes,
                                  const int32 num_samples, const float* scores,
                                  const float* maxima, OutputType* output) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+  GPU_1D_KERNEL_LOOP(index, nthreads) {
    const int maxima_idx = index / num_classes;
    if (ldg(maxima + maxima_idx) == ldg(scores + index)) {
      using UnsignedOutputType = typename std::make_unsigned<OutputType>::type;
-      CudaAtomicMax(reinterpret_cast<UnsignedOutputType*>(output + maxima_idx),
-                    static_cast<UnsignedOutputType>(index % num_classes));
+      GpuAtomicMax(reinterpret_cast<UnsignedOutputType*>(output + maxima_idx),
+                   static_cast<UnsignedOutputType>(index % num_classes));
    }
  }
 }
@@ -98,8 +104,9 @@ struct MultinomialFunctor<GPUDevice, T, OutputType> {
    // Max-reduce along classes for each (batch, sample).
    typedef const Eigen::array<TTypes<float>::Tensor::Index, 1>& ReductionAxes;
    Constants<GPUDevice> constants;
-    cub::Max op;
-    functor::ReduceImpl<float, cub::Max, float*, const float*, ReductionAxes>(
+    gpuprim::Max op;
+    functor::ReduceImpl<float, gpuprim::Max, float*, const float*,
+                        ReductionAxes>(
        /*ctx=*/ctx, /*out=*/maxima.data(), /*in=*/scores.data(), /*in_rank=*/2,
        /*in_dim0=*/batch_size * num_samples,
        /*in_dim1=*/num_classes, /*in_dim2=*/1, /*out_rank=*/1,
@@ -109,8 +116,8 @@ struct MultinomialFunctor<GPUDevice, T, OutputType> {
    output.device(d) = output.constant(0LL);

    const int32 work_items = batch_size * num_samples * num_classes;
-    GpuLaunchConfig config = GetCudaLaunchConfig(work_items, d);
-    TF_CHECK_OK(CudaLaunchKernel(
+    GpuLaunchConfig config = GetGpuLaunchConfig(work_items, d);
+    TF_CHECK_OK(GpuLaunchKernel(
        MultinomialKernel<OutputType>, config.block_count,
        config.thread_per_block, 0, d.stream(), config.virtual_thread_count,
        num_classes, num_samples, scores.data(), maxima.data(), output.data()));
@@ -133,4 +140,4 @@ template struct MultinomialFunctor<GPUDevice, int64, int64>;
 }  // namespace functor
 }  // namespace tensorflow

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM