From fc9e5a9435a6027f020508ed6fffe7807a657c0d Mon Sep 17 00:00:00 2001
From: Deven Desai
Date: Fri, 8 Mar 2019 16:44:28 +0000
Subject: [PATCH 1/6] fix for a compile error for the "--config=rocm" build.

---
 tensorflow/stream_executor/rocm/BUILD | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/stream_executor/rocm/BUILD b/tensorflow/stream_executor/rocm/BUILD
index 8c452a31d36..902d8f98ee0 100644
--- a/tensorflow/stream_executor/rocm/BUILD
+++ b/tensorflow/stream_executor/rocm/BUILD
@@ -232,7 +232,6 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/stream_executor:dnn",
         "//tensorflow/stream_executor:event",
-        "//tensorflow/stream_executor:logging_proto_cc",
         "//tensorflow/stream_executor:plugin_registry",
         "//tensorflow/stream_executor:scratch_allocator",
         "//tensorflow/stream_executor:stream_executor_pimpl_header",

From 699ff436856d6fb4b4df699710af872f820e087c Mon Sep 17 00:00:00 2001
From: Deven Desai
Date: Thu, 7 Mar 2019 14:22:22 +0000
Subject: [PATCH 2/6] removing the workaround for hipMalloc / hipHostMalloc

---
 tensorflow/stream_executor/gpu/gpu_types.h |  2 ++
 .../stream_executor/rocm/rocm_driver.cc    |  4 ++--
 .../rocm/rocm_driver_wrapper.h             | 19 ++++---------------
 3 files changed, 8 insertions(+), 17 deletions(-)

diff --git a/tensorflow/stream_executor/gpu/gpu_types.h b/tensorflow/stream_executor/gpu/gpu_types.h
index c69177d0760..64a6e5e5efc 100644
--- a/tensorflow/stream_executor/gpu/gpu_types.h
+++ b/tensorflow/stream_executor/gpu/gpu_types.h
@@ -20,6 +20,8 @@ limitations under the License.
 
 #if TENSORFLOW_USE_ROCM
 
+#define __HIP_DISABLE_CPP_FUNCTIONS__
+
 #include "rocm/include/hip/hip_complex.h"
 #include "rocm/include/hip/hip_runtime.h"
 #include "rocm/include/hiprand/hiprand.h"
diff --git a/tensorflow/stream_executor/rocm/rocm_driver.cc b/tensorflow/stream_executor/rocm/rocm_driver.cc
index 1b0b91426aa..94feef06d86 100644
--- a/tensorflow/stream_executor/rocm/rocm_driver.cc
+++ b/tensorflow/stream_executor/rocm/rocm_driver.cc
@@ -671,7 +671,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                             uint64 bytes) {
   ScopedActivateContext activated{context};
   hipDeviceptr_t result = 0;
-  hipError_t res = tensorflow::wrap::hipMallocVanilla(&result, bytes);
+  hipError_t res = tensorflow::wrap::hipMalloc(&result, bytes);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to allocate "
                << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
@@ -717,7 +717,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   ScopedActivateContext activation{context};
   void* host_mem = nullptr;
   // "Portable" memory is visible to all ROCM contexts. Safe for our use model.
-  hipError_t res = tensorflow::wrap::hipHostMallocVanilla(
+  hipError_t res = tensorflow::wrap::hipHostMalloc(
       &host_mem, bytes, hipHostMallocPortable);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to alloc " << bytes
diff --git a/tensorflow/stream_executor/rocm/rocm_driver_wrapper.h b/tensorflow/stream_executor/rocm/rocm_driver_wrapper.h
index 27495c2cbc0..c855bfb36a8 100644
--- a/tensorflow/stream_executor/rocm/rocm_driver_wrapper.h
+++ b/tensorflow/stream_executor/rocm/rocm_driver_wrapper.h
@@ -20,6 +20,8 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DRIVER_WRAPPER_H_
 #define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DRIVER_WRAPPER_H_
 
+#define __HIP_DISABLE_CPP_FUNCTIONS__
+
 #include "rocm/include/hip/hip_runtime.h"
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/platform/dso_loader.h"
@@ -48,21 +50,6 @@ namespace wrap {
 #define TO_STR_(x) #x
 #define TO_STR(x) TO_STR_(x)
 
-// hipMalloc and hipHostMalloc are defined as funtion templates in the
-// HIP header files, and hence their names get mangled and the attempt
-// to resolve their name when trying to dynamically load them will fail
-// Updating the HIP header files to make them C functions is underway.
-// Until that change flows through, we will workaround the issue by
-// creating dummy wrappers for them here
-
-hipError_t hipMallocVanilla(void** ptr, size_t size) {
-  return hipErrorNotInitialized;
-}
-
-hipError_t hipHostMallocVanilla(void** ptr, size_t size, unsigned int flags) {
-  return hipErrorNotInitialized;
-}
-
 #define STREAM_EXECUTOR_HIP_WRAP(hipSymbolName)                          \
   template <typename... Args>                                            \
   auto hipSymbolName(Args... args)->decltype(::hipSymbolName(args...)) { \
@@ -107,9 +94,11 @@ hipError_t hipHostMallocVanilla(void** ptr, size_t size, unsigned int flags) {
   __macro(hipGetDeviceCount)                 \
   __macro(hipGetDeviceProperties)            \
   __macro(hipHostFree)                       \
+  __macro(hipHostMalloc)                     \
   __macro(hipHostRegister)                   \
   __macro(hipHostUnregister)                 \
   __macro(hipInit)                           \
+  __macro(hipMalloc)                         \
   __macro(hipMemGetAddressRange)             \
   __macro(hipMemGetInfo)                     \
   __macro(hipMemcpyDtoD)                     \

From 1ef3d8eb9221aecb4e5d42fc9fb157ef1c084e79 Mon Sep 17 00:00:00 2001
From: Deven Desai
Date: Fri, 8 Mar 2019 18:16:07 +0000
Subject: [PATCH 3/6] adding calls to the newer hipMemset APIs

---
 .../stream_executor/rocm/rocm_driver.cc | 25 +++----------------
 .../rocm/rocm_driver_wrapper.h          |  3 +++
 2 files changed, 7 insertions(+), 21 deletions(-)

diff --git a/tensorflow/stream_executor/rocm/rocm_driver.cc b/tensorflow/stream_executor/rocm/rocm_driver.cc
index 94feef06d86..5b79aa0f92e 100644
--- a/tensorflow/stream_executor/rocm/rocm_driver.cc
+++ b/tensorflow/stream_executor/rocm/rocm_driver.cc
@@ -500,7 +500,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                        hipDeviceptr_t location, uint8 value,
                                        size_t size) {
   ScopedActivateContext activation{context};
-  hipError_t res = tensorflow::wrap::hipMemset(location, value, size);
+  hipError_t res = tensorflow::wrap::hipMemsetD8(location, value, size);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to memset memory: " << ToString(res);
     return false;
@@ -514,15 +514,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                         size_t uint32_count) {
   ScopedActivateContext activation{context};
   void* pointer = absl::bit_cast<void*>(location);
-  unsigned char valueC = static_cast<unsigned char>(value);
-  uint32_t value32 = (valueC << 24) | (valueC << 16) | (valueC << 8) | (valueC);
-  if (value32 != value) {
-    // mismatch indicates case where hipMemsetAsyc can't emulate hipMemSetD32
-    LOG(ERROR) << "failed to memset memory";
-    return false;
-  }
-  hipError_t res = tensorflow::wrap::hipMemset(pointer, static_cast<int>(value),
-                                               uint32_count * 4);
+  hipError_t res = tensorflow::wrap::hipMemsetD32(pointer, value, uint32_count);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to memset memory: " << ToString(res);
     return false;
@@ -553,17 +545,8 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                          GpuStreamHandle stream) {
   ScopedActivateContext activation{context};
   void* pointer = absl::bit_cast<void*>(location);
-
-  // FIXME - need to set a 32-bit value here
-  unsigned char valueC = static_cast<unsigned char>(value);
-  uint32_t value32 = (valueC << 24) | (valueC << 16) | (valueC << 8) | (valueC);
-  if (value32 != value) {
-    // mismatch indicates case where hipMemsetAsyc can't emulate hipMemSetD32
-    LOG(ERROR) << "failed to memset memory";
-    return false;
-  }
-  hipError_t res = tensorflow::wrap::hipMemsetAsync(pointer, value,
-                                                    uint32_count * 4, stream);
+  hipError_t res =
+      tensorflow::wrap::hipMemsetD32Async(pointer, value, uint32_count, stream);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
     return false;
diff --git a/tensorflow/stream_executor/rocm/rocm_driver_wrapper.h b/tensorflow/stream_executor/rocm/rocm_driver_wrapper.h
index c855bfb36a8..ba803edaafb 100644
--- a/tensorflow/stream_executor/rocm/rocm_driver_wrapper.h
+++ b/tensorflow/stream_executor/rocm/rocm_driver_wrapper.h
@@ -108,7 +108,10 @@ namespace wrap {
   __macro(hipMemcpyHtoD)                     \
   __macro(hipMemcpyHtoDAsync)                \
   __macro(hipMemset)                         \
+  __macro(hipMemsetD32)                      \
+  __macro(hipMemsetD8)                       \
   __macro(hipMemsetAsync)                    \
+  __macro(hipMemsetD32Async)                 \
   __macro(hipModuleGetFunction)              \
   __macro(hipModuleGetGlobal)                \
   __macro(hipModuleLaunchKernel)             \

From 2f589bfa2a4a917ab89175a070b46e993f19830e Mon Sep 17 00:00:00 2001
From: Deven Desai
Date: Thu, 7 Mar 2019 18:58:28 +0000
Subject: [PATCH 4/6] adding ROCm support in grappler code that does operator
 cost calculations for devices

---
 tensorflow/core/grappler/clusters/utils.cc | 34 ++++++++++++++++++++++
 tensorflow/tensorflow.bzl                  |  1 +
 2 files changed, 35 insertions(+)

diff --git a/tensorflow/core/grappler/clusters/utils.cc b/tensorflow/core/grappler/clusters/utils.cc
index 567e7c075e0..f1d3a77e3f0 100644
--- a/tensorflow/core/grappler/clusters/utils.cc
+++ b/tensorflow/core/grappler/clusters/utils.cc
@@ -23,6 +23,10 @@ limitations under the License.
 #include "cuda/include/cudnn.h"
 #endif
 
+#if TENSORFLOW_USE_ROCM
+#include "rocm/include/hip/hip_runtime.h"
+#endif
+
 #ifdef EIGEN_USE_LIBXSMM
 #include "include/libxsmm.h"
 #endif
@@ -109,6 +113,36 @@ DeviceProperties GetLocalGPUInfo(PlatformGpuId platform_gpu_id) {
       strings::StrCat(properties.major, ".", properties.minor);
   (*device.mutable_environment())["cuda"] = strings::StrCat(CUDA_VERSION);
   (*device.mutable_environment())["cudnn"] = strings::StrCat(CUDNN_VERSION);
+
+#elif TENSORFLOW_USE_ROCM
+  hipDeviceProp_t properties;
+  hipError_t error =
+      hipGetDeviceProperties(&properties, platform_gpu_id.value());
+  if (error != hipSuccess) {
+    device.set_type("UNKNOWN");
+    LOG(ERROR) << "Failed to get device properties, error code: " << error;
+    return device;
+  }
+
+  // ROCM TODO review if numbers here are valid
+  device.set_vendor("Advanced Micro Devices, Inc");
+  device.set_model(properties.name);
+  device.set_frequency(properties.clockRate * 1e-3);
+  device.set_num_cores(properties.multiProcessorCount);
+  device.set_num_registers(properties.regsPerBlock);
+  device.set_l1_cache_size(16 * 1024);
+  device.set_l2_cache_size(properties.l2CacheSize);
+  device.set_l3_cache_size(0);
+  device.set_shared_memory_size_per_multiprocessor(
+      properties.maxSharedMemoryPerMultiProcessor);
+  device.set_memory_size(properties.totalGlobalMem);
+  // 8 is the number of bits per byte. 2 is accounted for
+  // double data rate (DDR).
+  device.set_bandwidth(properties.memoryBusWidth / 8 *
+                       properties.memoryClockRate * 2);
+
+  (*device.mutable_environment())["architecture"] =
+      strings::StrCat("gfx", properties.gcnArch);
 #endif
 
   return device;
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index a7684a17ae7..f22bbc7bdd0 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -1209,6 +1209,7 @@ def tf_cuda_library(deps = None, cuda_deps = None, copts = tf_copts(), **kwargs)
             "@local_config_cuda//cuda:cuda_headers",
         ]) + if_rocm_is_configured(cuda_deps + [
            # rocm_header placeholder
+            "@local_config_rocm//rocm:rocm_headers",
         ]),
         copts = (copts + if_cuda(["-DGOOGLE_CUDA=1"]) + if_rocm(["-DTENSORFLOW_USE_ROCM=1"]) + if_mkl(["-DINTEL_MKL=1"]) + if_mkl_open_source_only(["-DINTEL_MKL_DNN_ONLY"]) + if_enable_mkl(["-DENABLE_MKL"]) + if_tensorrt(["-DGOOGLE_TENSORRT=1"])),
         **kwargs

From 13ac37f915a3ec17864338ebdd0da4b5a813d725 Mon Sep 17 00:00:00 2001
From: Deven Desai
Date: Thu, 7 Mar 2019 20:12:17 +0000
Subject: [PATCH 5/6] fixing a bug in the ROCm versions of the Get*DsoHandle
 APIs

---
 .../stream_executor/platform/default/dso_loader.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/stream_executor/platform/default/dso_loader.cc b/tensorflow/stream_executor/platform/default/dso_loader.cc
index a2a9ad67534..6ed7480ff0c 100644
--- a/tensorflow/stream_executor/platform/default/dso_loader.cc
+++ b/tensorflow/stream_executor/platform/default/dso_loader.cc
@@ -158,27 +158,27 @@ port::StatusOr<void*> GetCudnnDsoHandle() {
 
 port::StatusOr<void*> GetRocblasDsoHandle() {
   static auto result = new auto(DsoLoader::GetRocblasDsoHandle());
-  return result;
+  return *result;
 }
 
 port::StatusOr<void*> GetMiopenDsoHandle() {
   static auto result = new auto(DsoLoader::GetMiopenDsoHandle());
-  return result;
+  return *result;
 }
 
 port::StatusOr<void*> GetRocfftDsoHandle() {
   static auto result = new auto(DsoLoader::GetRocfftDsoHandle());
-  return result;
+  return *result;
 }
 
 port::StatusOr<void*> GetRocrandDsoHandle() {
   static auto result = new auto(DsoLoader::GetRocrandDsoHandle());
-  return result;
+  return *result;
 }
 
 port::StatusOr<void*> GetHipDsoHandle() {
   static auto result = new auto(DsoLoader::GetHipDsoHandle());
-  return result;
+  return *result;
 }
 
 }  // namespace CachedDsoLoader

From 017b6bb043e0f817abfc1be7a50a31007c5943ca Mon Sep 17 00:00:00 2001
From: Deven Desai
Date: Wed, 6 Mar 2019 19:27:51 +0000
Subject: [PATCH 6/6] adding ROCm support for the softplus_op

---
 tensorflow/core/kernels/softplus_op.cc        | 4 ++--
 tensorflow/core/kernels/softplus_op_gpu.cu.cc | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/kernels/softplus_op.cc b/tensorflow/core/kernels/softplus_op.cc
index d3fc0e1461b..fb00e1bb08c 100644
--- a/tensorflow/core/kernels/softplus_op.cc
+++ b/tensorflow/core/kernels/softplus_op.cc
@@ -87,7 +87,7 @@ void SoftplusGradOp<Device, T>::OperateNoTemplate(OpKernelContext* context,
 TF_CALL_FLOAT_TYPES(REGISTER_KERNELS);
 #undef REGISTER_KERNELS
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
 #define DECLARE_GPU_SPEC(T)                                   \
@@ -119,6 +119,6 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
 #undef REGISTER_GPU_KERNELS
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/softplus_op_gpu.cu.cc b/tensorflow/core/kernels/softplus_op_gpu.cu.cc
index 8df734588b8..900df277a5b 100644
--- a/tensorflow/core/kernels/softplus_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/softplus_op_gpu.cu.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #define EIGEN_USE_GPU
 
@@ -37,4 +37,4 @@ TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
 
 }  // end namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
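
As background on patch 2: the STREAM_EXECUTOR_HIP_WRAP macro in rocm_driver_wrapper.h resolves each HIP entry point by its literal name at runtime, which only works for symbols with plain C linkage; that is why defining __HIP_DISABLE_CPP_FUNCTIONS__ and dropping the hipMallocVanilla / hipHostMallocVanilla stubs is enough to make hipMalloc and hipHostMalloc loadable. The stand-alone sketch below illustrates that mechanism with nothing but dlopen/dlsym; it is not part of the patch series, and the library name "libamdhip64.so" is an assumption about a typical Linux ROCm install rather than something taken from the patches.

// Illustrative sketch only: shows why an unmangled C symbol can be resolved
// with dlsym() while a mangled C++ function template cannot. Build with -ldl.
#include <dlfcn.h>

#include <cstddef>
#include <cstdio>

// Stand-in for hipError_t (an enum in the real HIP headers).
using hip_error_int = int;
// Function-pointer type matching the C signature of hipMalloc.
using hip_malloc_fn = hip_error_int (*)(void** ptr, std::size_t size);

int main() {
  // Load the HIP runtime the way a dynamic wrapper layer would.
  // "libamdhip64.so" is an assumed soname for illustration.
  void* handle = dlopen("libamdhip64.so", RTLD_LAZY);
  if (handle == nullptr) {
    std::fprintf(stderr, "dlopen failed: %s\n", dlerror());
    return 1;
  }

  // This lookup succeeds only if "hipMalloc" has extern "C" linkage;
  // a mangled template instantiation would never match the literal name.
  void* sym = dlsym(handle, "hipMalloc");
  if (sym == nullptr) {
    std::fprintf(stderr, "dlsym failed: %s\n", dlerror());
    dlclose(handle);
    return 1;
  }

  auto hip_malloc = reinterpret_cast<hip_malloc_fn>(sym);
  void* device_ptr = nullptr;
  hip_error_int err = hip_malloc(&device_ptr, 1024);  // request 1 KiB of device memory
  std::printf("hipMalloc returned %d, ptr=%p\n", err, device_ptr);

  dlclose(handle);
  return 0;
}

On a machine without the HIP runtime the sketch simply reports the dlopen failure and exits.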