Adding ROCm support for compare_and_bitpack op
This commit is contained in:
parent
b211c7a053
commit
bf65e06819
@ -164,7 +164,7 @@ struct CompareAndBitpack<CPUDevice, T> {
|
||||
|
||||
} // namespace functor
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
#define REGISTER_COMPARE_AND_BITPACK(type) \
|
||||
REGISTER_KERNEL_BUILDER( \
|
||||
@ -193,6 +193,6 @@ TF_CALL_bool(DECLARE_GPU_SPEC)
|
||||
|
||||
} // namespace functor
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
} // namespace tensorflow
|
||||
|
@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
@ -41,7 +41,7 @@ __global__ void CompareAndBitpackKernel(const int size, const T* threshold,
|
||||
// result for 4 blocks) followed by an appropriate shift and mask to
|
||||
// get the 8-bits of interest.
|
||||
const T thresh = ldg(threshold);
|
||||
CUDA_1D_KERNEL_LOOP(i, size) {
|
||||
GPU_1D_KERNEL_LOOP(i, size) {
|
||||
const T* block = input + 8 * i;
|
||||
output[i] =
|
||||
((((ldg(block) > thresh) << 7)) | (((ldg(block + 1) > thresh) << 6)) |
|
||||
@ -61,7 +61,7 @@ __global__ void CompareAndBitpackKernel<bool>(const int size,
|
||||
// TODO(ebrevdo): Erich said: I think you could again have multiple
|
||||
// threads work on one block and use the ballot instruction to the
|
||||
// bit packing in one instruction.
|
||||
CUDA_1D_KERNEL_LOOP(i, size) {
|
||||
GPU_1D_KERNEL_LOOP(i, size) {
|
||||
const int64 block = ldg(reinterpret_cast<const int64*>(input + 8 * i));
|
||||
// NOTE(ebrevdo): This assumes memory is little-endian.
|
||||
output[i] =
|
||||
@ -81,7 +81,7 @@ __global__ void CompareAndBitpackKernel<float>(const int size,
|
||||
const float* input,
|
||||
uint8* output) {
|
||||
const float thresh = ldg(threshold);
|
||||
CUDA_1D_KERNEL_LOOP(i, size) {
|
||||
GPU_1D_KERNEL_LOOP(i, size) {
|
||||
const float4 block0 = ldg(reinterpret_cast<const float4*>(input + 8 * i));
|
||||
const float4 block1 =
|
||||
ldg(reinterpret_cast<const float4*>(input + 8 * i + 4));
|
||||
@ -98,7 +98,7 @@ __global__ void CompareAndBitpackKernel<double>(const int size,
|
||||
const double* input,
|
||||
uint8* output) {
|
||||
const double thresh = ldg(threshold);
|
||||
CUDA_1D_KERNEL_LOOP(i, size) {
|
||||
GPU_1D_KERNEL_LOOP(i, size) {
|
||||
const double2 block0 = ldg(reinterpret_cast<const double2*>(input + 8 * i));
|
||||
const double2 block1 =
|
||||
ldg(reinterpret_cast<const double2*>(input + 8 * i + 2));
|
||||
@ -121,12 +121,12 @@ __global__ void CompareAndBitpackKernel<double>(const int size,
|
||||
TTypes<uint8>::Matrix output) { \
|
||||
const GPUDevice& d = c->eigen_device<GPUDevice>(); \
|
||||
int64 total_count = output.size(); \
|
||||
GpuLaunchConfig config = GetCudaLaunchConfig(total_count, d); \
|
||||
GpuLaunchConfig config = GetGpuLaunchConfig(total_count, d); \
|
||||
\
|
||||
TF_CHECK_OK(CudaLaunchKernel(CompareAndBitpackKernel<T>, \
|
||||
config.block_count, config.thread_per_block, \
|
||||
0, d.stream(), total_count, threshold.data(), \
|
||||
input.data(), output.data())); \
|
||||
TF_CHECK_OK(GpuLaunchKernel(CompareAndBitpackKernel<T>, \
|
||||
config.block_count, config.thread_per_block, \
|
||||
0, d.stream(), total_count, threshold.data(), \
|
||||
input.data(), output.data())); \
|
||||
}
|
||||
|
||||
TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS)
|
||||
@ -138,4 +138,4 @@ TF_CALL_bool(DEFINE_GPU_SPECS)
|
||||
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
Loading…
Reference in New Issue
Block a user