From dfff1ff4371e6490a66bab2163b0e4fc764e429a Mon Sep 17 00:00:00 2001
From: sunway513 <sunway513@gmail.com>
Date: Wed, 4 Mar 2020 11:39:15 -0600
Subject: [PATCH] Update Tensorflow ROCm code base to build with hipclang
 compiler

---
 tensorflow/core/kernels/conv_2d_gpu.h                      | 2 +-
 tensorflow/core/lib/bfloat16/bfloat16.h                    | 2 +-
 .../crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl  | 7 +++++++
 third_party/gpus/rocm_configure.bzl                        | 4 ++++
 4 files changed, 13 insertions(+), 2 deletions(-)
diff --git a/tensorflow/core/kernels/conv_2d_gpu.h b/tensorflow/core/kernels/conv_2d_gpu.h
index 22d7f939686..31abe9dfead 100644
--- a/tensorflow/core/kernels/conv_2d_gpu.h
+++ b/tensorflow/core/kernels/conv_2d_gpu.h
@@ -236,7 +236,7 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
   // One extra line in the inner dimension to avoid share memory bank conflict.
   // This is to mimic the following, but no constructor of T can be invoked.
   //     __shared__ T shared_memory_tile[TileSizeI][TileSizeJ + 1];
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_COMPILER_IS_HIP_CLANG
   __shared__ __align__(
       alignof(T)) char shared_mem_raw[TileSizeI * (TileSizeJ + 1) * sizeof(T)];
   typedef T(*SharedMemoryTile)[TileSizeJ + 1];
diff --git a/tensorflow/core/lib/bfloat16/bfloat16.h b/tensorflow/core/lib/bfloat16/bfloat16.h
index a25f4d947ed..d4e33143f29 100644
--- a/tensorflow/core/lib/bfloat16/bfloat16.h
+++ b/tensorflow/core/lib/bfloat16/bfloat16.h
@@ -22,7 +22,7 @@ limitations under the License.
 
 #include "tensorflow/core/platform/byte_order.h"
 
-#ifdef __CUDACC__
+#if defined(__CUDACC__) || (defined(__HIPCC__) && defined(__HIP__))
 // All functions callable from CUDA code must be qualified with __device__
 #define B16_DEVICE_FUNC __host__ __device__
 
diff --git a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl
index 8a94afbfde1..f5ac7b39dfd 100755
--- a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl
+++ b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl
@@ -173,6 +173,13 @@ def InvokeHipcc(argv, log=False):
   out = ' -o ' + out_file[0]
 
   hipccopts = ' '
+  # In a hip-clang environment, the HIP runtime header must be included
+  # before any standard math header such as <complex> is included by a source
+  # file; otherwise the build fails.
+  # The uninitialised-shared-variable diagnostic must also remain a warning
+  # rather than an error, even when -Werror is specified.
+  if HIPCC_IS_HIPCLANG:
+    hipccopts += ' --include=hip/hip_runtime.h -Wno-error=cuda-shared-init '
   hipccopts += ' ' + hipcc_compiler_options
   # Use -fno-gpu-rdc by default for early GPU kernel finalization
   # This flag would trigger GPU kernels be generated at compile time, instead
diff --git a/third_party/gpus/rocm_configure.bzl b/third_party/gpus/rocm_configure.bzl
index 20ff2a4aafa..3c345e6724b 100644
--- a/third_party/gpus/rocm_configure.bzl
+++ b/third_party/gpus/rocm_configure.bzl
@@ -250,6 +250,10 @@ def _rocm_include_path(repository_ctx, rocm_config):
     inc_dirs.append(rocm_config.rocm_toolkit_path + "/hcc/compiler/lib/clang/10.0.0/include/")
     inc_dirs.append(rocm_config.rocm_toolkit_path + "/hcc/lib/clang/10.0.0/include")
 
+    # Support hcc based on clang 11.0.0, which ships with ROCm 3.1.
+    inc_dirs.append(rocm_config.rocm_toolkit_path + "/hcc/compiler/lib/clang/11.0.0/include/")
+    inc_dirs.append(rocm_config.rocm_toolkit_path + "/hcc/lib/clang/11.0.0/include")
+
     return inc_dirs
 
 def _enable_rocm(repository_ctx):