Adding ROCm support for the multinomial op

This commit is contained in:
Deven Desai 2019-06-25 15:08:11 +00:00
parent 2f4121b1a7
commit 02afe6c1eb
2 changed files with 22 additions and 15 deletions

View File

@@ -53,7 +53,7 @@ struct MultinomialFunctor {
typename TTypes<OutputType>::Matrix output);
};
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
extern template struct MultinomialFunctor<GPUDevice, Eigen::half, int32>;
extern template struct MultinomialFunctor<GPUDevice, float, int32>;
extern template struct MultinomialFunctor<GPUDevice, double, int32>;
@@ -65,7 +65,7 @@ extern template struct MultinomialFunctor<GPUDevice, float, int64>;
extern template struct MultinomialFunctor<GPUDevice, double, int64>;
extern template struct MultinomialFunctor<GPUDevice, int32, int64>;
extern template struct MultinomialFunctor<GPUDevice, int64, int64>;
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <typename T, typename OutputType>
struct MultinomialFunctor<CPUDevice, T, OutputType> {
@@ -253,7 +253,7 @@ TF_CALL_float(REGISTER);
TF_CALL_double(REGISTER);
#undef REGISTER
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define REGISTER(TYPE) \
REGISTER_KERNEL_BUILDER(Name("Multinomial") \
.Device(DEVICE_GPU) \
@@ -273,7 +273,7 @@ TF_CALL_float(REGISTER);
TF_CALL_double(REGISTER);
#undef REGISTER
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <typename Device, typename T, typename OutputType>
class StatelessMultinomialOp : public MultinomialOp<Device, T, OutputType> {
@@ -321,7 +321,7 @@ TF_CALL_float(REGISTER);
TF_CALL_double(REGISTER);
#undef REGISTER
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define REGISTER(TYPE) \
REGISTER_KERNEL_BUILDER(Name("StatelessMultinomial") \
.Device(DEVICE_GPU) \
@@ -343,7 +343,7 @@ TF_CALL_float(REGISTER);
TF_CALL_double(REGISTER);
#undef REGISTER
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
} // end namespace

View File

@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
@@ -29,6 +29,12 @@ limitations under the License.
#include "tensorflow/core/lib/random/random_distributions.h"
#include "tensorflow/core/util/gpu_kernel_helper.h"
// Give the platform's primitives namespace a single name (`gpuprim`) so the
// code below can refer to gpuprim::Max regardless of which GPU backend is
// being compiled: ::cub under GOOGLE_CUDA, ::hipcub under TENSORFLOW_USE_ROCM.
#if GOOGLE_CUDA
namespace gpuprim = ::cub;
#elif TENSORFLOW_USE_ROCM
namespace gpuprim = ::hipcub;
#endif
namespace tensorflow {
namespace functor {
@@ -41,12 +47,12 @@ template <typename OutputType>
// Writes, for each row of `scores`, the class index whose score equals that
// row's entry in `maxima` into `output`.
//
// Each flat `index` visited by the kernel loop is split into
//   row   = index / num_classes   (used to index `maxima` and `output`)
//   class = index % num_classes
// When scores[index] compares equal to maxima[row], `class` is combined into
// output[row] through the atomic-max helper, operating on the unsigned
// counterpart of OutputType. Presumably this leaves the largest matching class
// index in output[row] when several classes tie — confirm against the helper.
//
// `num_samples` is accepted but not read anywhere in the body shown here.
//
// NOTE(review): this listing looks like a rendered diff with the +/- markers
// stripped. The CUDA_1D_KERNEL_LOOP / CudaAtomicMax lines and the
// GPU_1D_KERNEL_LOOP / GpuAtomicMax lines that follow them appear to be the
// before and after spellings of the same statement, not nested code — confirm
// against the real file before relying on the brace structure below.
__global__ void MultinomialKernel(int32 nthreads, const int32 num_classes,
const int32 num_samples, const float* scores,
const float* maxima, OutputType* output) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
GPU_1D_KERNEL_LOOP(index, nthreads) {
// Row of `maxima` / `output` that this flat element belongs to.
const int maxima_idx = index / num_classes;
// Exact float equality: only elements equal to the row's maximum proceed.
if (ldg(maxima + maxima_idx) == ldg(scores + index)) {
// The atomic is issued on the unsigned view of the output element.
using UnsignedOutputType = typename std::make_unsigned<OutputType>::type;
CudaAtomicMax(reinterpret_cast<UnsignedOutputType*>(output + maxima_idx),
static_cast<UnsignedOutputType>(index % num_classes));
GpuAtomicMax(reinterpret_cast<UnsignedOutputType*>(output + maxima_idx),
static_cast<UnsignedOutputType>(index % num_classes));
}
}
}
@@ -98,8 +104,9 @@ struct MultinomialFunctor<GPUDevice, T, OutputType> {
// Max-reduce along classes for each (batch, sample).
typedef const Eigen::array<TTypes<float>::Tensor::Index, 1>& ReductionAxes;
Constants<GPUDevice> constants;
cub::Max op;
functor::ReduceImpl<float, cub::Max, float*, const float*, ReductionAxes>(
gpuprim::Max op;
functor::ReduceImpl<float, gpuprim::Max, float*, const float*,
ReductionAxes>(
/*ctx=*/ctx, /*out=*/maxima.data(), /*in=*/scores.data(), /*in_rank=*/2,
/*in_dim0=*/batch_size * num_samples,
/*in_dim1=*/num_classes, /*in_dim2=*/1, /*out_rank=*/1,
@@ -109,8 +116,8 @@ struct MultinomialFunctor<GPUDevice, T, OutputType> {
output.device(d) = output.constant(0LL);
const int32 work_items = batch_size * num_samples * num_classes;
GpuLaunchConfig config = GetCudaLaunchConfig(work_items, d);
TF_CHECK_OK(CudaLaunchKernel(
GpuLaunchConfig config = GetGpuLaunchConfig(work_items, d);
TF_CHECK_OK(GpuLaunchKernel(
MultinomialKernel<OutputType>, config.block_count,
config.thread_per_block, 0, d.stream(), config.virtual_thread_count,
num_classes, num_samples, scores.data(), maxima.data(), output.data()));
@@ -133,4 +140,4 @@ template struct MultinomialFunctor<GPUDevice, int64, int64>;
} // namespace functor
} // namespace tensorflow
#endif // GOOGLE_CUDA
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM