From 00ab761ec10c74c4ad100c6ebb9a2500d12d2f66 Mon Sep 17 00:00:00 2001
From: Deven Desai <deven.desai.amd@gmail.com>
Date: Wed, 12 Jun 2019 22:20:58 +0000
Subject: [PATCH] [ROCm] Fix for the broken `--config=rocm` build

The following PR broke the `--config=rocm` build when it was merged:

https://github.com/tensorflow/tensorflow/commit/754ac36f54db34d303a60eb08d34199a7945e576

The above PR uses the Cuda* versions of the GPU utility APIs. The Cuda* names are not visible in the ROCm build, hence the breakage.
Going forward, all new code must use the Gpu* names when calling the GPU utility routines.
---
 tensorflow/core/kernels/matrix_diag_op_gpu.cu.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/tensorflow/core/kernels/matrix_diag_op_gpu.cu.cc b/tensorflow/core/kernels/matrix_diag_op_gpu.cu.cc
index 05db60327f5..9f6d4a0ea87 100644
--- a/tensorflow/core/kernels/matrix_diag_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/matrix_diag_op_gpu.cu.cc
@@ -33,7 +33,7 @@ __global__ void MatrixDiagKernel(const int num_threads, const int num_rows,
                                  const int lower_diag_index,
                                  const int upper_diag_index, const T padding,
                                  const T* diag_ptr, T* output_ptr) {
-  CUDA_1D_KERNEL_LOOP(index, num_threads) {
+  GPU_1D_KERNEL_LOOP(index, num_threads) {
     const int batch_and_row_index = index / num_cols;
     const int col = index - batch_and_row_index * num_cols;
     const int batch = batch_and_row_index / num_rows;
@@ -69,7 +69,7 @@ struct MatrixDiag<GPUDevice, T> {
     }
     GpuLaunchConfig config =
         GetGpuLaunchConfig(batch_size * num_rows * num_cols, device);
-    TF_CHECK_OK(CudaLaunchKernel(
+    TF_CHECK_OK(GpuLaunchKernel(
         MatrixDiagKernel<T>, config.block_count, config.thread_per_block, 0,
         device.stream(), config.virtual_thread_count, num_rows, num_cols,
         num_diags, max_diag_len, lower_diag_index, upper_diag_index, padding,
@@ -85,7 +85,7 @@ __global__ void MatrixDiagPartKernel(const int num_threads, const int num_rows,
                                      const int upper_diag_index,
                                      const T padding, const T* input_ptr,
                                      T* output_ptr) {
-  CUDA_1D_KERNEL_LOOP(index, num_threads) {
+  GPU_1D_KERNEL_LOOP(index, num_threads) {
     const int batch_and_mapped_diag_index = index / max_diag_len;
     const int index_in_the_diagonal =
         index - batch_and_mapped_diag_index * max_diag_len;
@@ -121,8 +121,8 @@ struct MatrixDiagPart<GPUDevice, T> {
       return;
     }
     GpuLaunchConfig config =
-        GetCudaLaunchConfig(batch_size * num_diags * max_diag_len, device);
-    TF_CHECK_OK(CudaLaunchKernel(
+        GetGpuLaunchConfig(batch_size * num_diags * max_diag_len, device);
+    TF_CHECK_OK(GpuLaunchKernel(
         MatrixDiagPartKernel<T>, config.block_count, config.thread_per_block, 0,
         device.stream(), config.virtual_thread_count, num_rows, num_cols,
         num_diags, max_diag_len, lower_diag_index, upper_diag_index, padding,