Adding ROCM support for the population_count op

2019-06-10 15:39:12 +00:00 · 2019-06-10 15:39:12 +00:00 · e7f1163401
commit e7f1163401
parent dac4bd7750
2 changed files with 10 additions and 10 deletions
--- a/tensorflow/core/kernels/population_count_op.cc
+++ b/tensorflow/core/kernels/population_count_op.cc
@@ -122,7 +122,7 @@ struct PopulationCount<CPUDevice, T> {

 }  // namespace functor

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #define REGISTER_POPULATION_COUNT(type)                                     \
  REGISTER_KERNEL_BUILDER(                                                  \
@@ -158,6 +158,6 @@ TF_CALL_int64(DECLARE_GPU_SPEC);

 }  // namespace functor

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 }  // namespace tensorflow
--- a/tensorflow/core/kernels/population_count_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/population_count_op_gpu.cu.cc
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #define EIGEN_USE_GPU

@@ -35,14 +35,14 @@ namespace functor {
 template <typename T>
 __global__ void PopulationCountKernel(const int size, const T* input,
                                      uint8* output) {
-  CUDA_1D_KERNEL_LOOP(i, size) { output[i] = __popc(ldg(input + i)); }
+  GPU_1D_KERNEL_LOOP(i, size) { output[i] = __popc(ldg(input + i)); }
 }

 template <>
 __global__ void PopulationCountKernel(const int size, const int8* input,
                                      uint8* output) {
  // For some reason, __popc on a negative int8 gets confused.
-  CUDA_1D_KERNEL_LOOP(i, size) {
+  GPU_1D_KERNEL_LOOP(i, size) {
    output[i] = __popc(ldg(reinterpret_cast<const uint8*>(input + i)));
  }
 }
@@ -51,7 +51,7 @@ template <>
 __global__ void PopulationCountKernel(const int size, const int16* input,
                                      uint8* output) {
  // For some reason, __popc on a negative int16 gets confused.
-  CUDA_1D_KERNEL_LOOP(i, size) {
+  GPU_1D_KERNEL_LOOP(i, size) {
    output[i] = __popc(ldg(reinterpret_cast<const uint16*>(input + i)));
  }
 }
@@ -59,7 +59,7 @@ __global__ void PopulationCountKernel(const int size, const int16* input,
 template <>
 __global__ void PopulationCountKernel<int64>(const int size, const int64* input,
                                             uint8* output) {
-  CUDA_1D_KERNEL_LOOP(i, size) { output[i] = __popcll(ldg(input + i)); }
+  GPU_1D_KERNEL_LOOP(i, size) { output[i] = __popcll(ldg(input + i)); }
 }

 #define DEFINE_GPU_SPECS(T)                                                    \
@@ -69,8 +69,8 @@ __global__ void PopulationCountKernel<int64>(const int size, const int64* input,
      TTypes<uint8>::Flat output) {                                            \
    const GPUDevice& d = c->eigen_device<GPUDevice>();                         \
    int64 total_count = input.size();                                          \
-    GpuLaunchConfig config = GetCudaLaunchConfig(total_count, d);              \
-    TF_CHECK_OK(CudaLaunchKernel(PopulationCountKernel<T>, config.block_count, \
+    GpuLaunchConfig config = GetGpuLaunchConfig(total_count, d);              \
+    TF_CHECK_OK(GpuLaunchKernel(PopulationCountKernel<T>, config.block_count, \
                                 config.thread_per_block, 0, d.stream(),       \
                                 total_count, input.data(), output.data()));   \
  }
@@ -88,4 +88,4 @@ TF_CALL_int64(DEFINE_GPU_SPECS);

 }  // namespace tensorflow

-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM