From b7ab9b9e5fcff44588763a2c97103b1ae6468e06 Mon Sep 17 00:00:00 2001 From: "Wen-Heng (Jack) Chung" Date: Fri, 9 Aug 2019 17:36:19 +0000 Subject: [PATCH 1/2] Enable XLA JIT compiler on ROCm. - bazel changes to enable xla_gpu_device and xla_gpu_jit on ROCm. - Disable cusolver_context on ROCm. It has source code dependency to CUDA API. - Disable dependency to cholesky_thunk on ROCm. It has source code dependency to CUDA API. - Remove cudnn_conv_algorithm_picker from gpu_compiler_impl dependency list. It is conditionally dependent when CUDA is enabled. - Remove CUDA-specific header inclusions in collective_permute_thunk and custom_call_thunk. These 2 thunks actually work on ROCm. - Partially enable ptxas_utils.h to make things build on ROCm. Full-fledged solution is on PR #30884. --- tensorflow/compiler/jit/BUILD | 16 ++++++++++++++-- tensorflow/compiler/xla/service/gpu/BUILD | 9 ++++----- .../xla/service/gpu/collective_permute_thunk.cc | 1 - .../xla/service/gpu/custom_call_thunk.cc | 1 - 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index 1ebfe235b4d..16848110ece 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -1,5 +1,8 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_test", "cc_header_only_library") load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") +load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured") +load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm") +load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured") load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library", "tf_jit_compilation_passes_extra_deps") load("//tensorflow/core/platform:default/build_config.bzl", "tf_additional_all_protos", "tf_proto_library") @@ -38,7 +41,10 @@ cc_library( ":xla_cpu_device", ":xla_cpu_jit", "//tensorflow/compiler/plugin", - ] + if_cuda([ + ] + if_cuda_is_configured([ + ":xla_gpu_device", + ":xla_gpu_jit", + ]) + 
if_rocm_is_configured([ ":xla_gpu_device", ":xla_gpu_jit", ]), @@ -61,7 +67,13 @@ cc_library( cc_library( name = "xla_gpu_jit", visibility = ["//visibility:public"], - deps = if_cuda([ + deps = if_cuda_is_configured([ + ":jit_compilation_passes", + "//tensorflow/compiler/jit/kernels:xla_ops", + "//tensorflow/compiler/tf2xla/kernels:xla_ops", + "//tensorflow/compiler/tf2xla/kernels:xla_dummy_ops", + "//tensorflow/compiler/xla/service:gpu_plugin", + ]) + if_rocm_is_configured([ ":jit_compilation_passes", "//tensorflow/compiler/jit/kernels:xla_ops", "//tensorflow/compiler/tf2xla/kernels:xla_ops", diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index 9f709ddb058..08a4903eea2 100755 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -731,9 +731,9 @@ tf_cc_test( cc_library( name = "cusolver_context", - srcs = ["cusolver_context.cc"], - hdrs = ["cusolver_context.h"], - deps = [ + srcs = if_cuda_is_configured(["cusolver_context.cc"]), + hdrs = if_cuda_is_configured(["cusolver_context.h"]), + deps = if_cuda_is_configured([ # LINT.IfChange "@local_config_cuda//cuda:cublas_headers", # LINT.ThenChange(//tensorflow/copy.bara.sky:cublas_headers) @@ -745,7 +745,7 @@ cc_library( "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/stream_executor:blas", "//tensorflow/stream_executor/cuda:cusolver_lib", - ], + ]), ) cc_library( @@ -1053,7 +1053,6 @@ cc_library( deps = [ ":alias_passthrough_params", ":cudnn_batchnorm_rewriter", - ":cudnn_conv_algorithm_picker", ":cudnn_conv_padding_legalization", ":cudnn_conv_rewriter", ":fusion_merger", diff --git a/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.cc b/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.cc index 60301b4de64..2fe359861f8 100644 --- a/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.cc @@ -22,7 +22,6 @@ 
limitations under the License. #include "absl/algorithm/container.h" #include "absl/memory/memory.h" -#include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "tensorflow/compiler/xla/refcounting_hash_map.h" #include "tensorflow/compiler/xla/service/hlo_casting_utils.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" diff --git a/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc b/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc index 65673106391..85571804315 100644 --- a/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc @@ -16,7 +16,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/custom_call_thunk.h" #include "absl/strings/str_format.h" -#include "tensorflow/stream_executor/cuda/cuda_stream.h" #include "tensorflow/stream_executor/gpu/gpu_stream.h" namespace xla { From 4fd5b07bd7c0db869509803c6c2b0d9d13a82bb1 Mon Sep 17 00:00:00 2001 From: "Wen-Heng (Jack) Chung" Date: Thu, 22 Aug 2019 16:41:57 +0000 Subject: [PATCH 2/2] Skip including cholesky_thunk in thunk_emitter on ROCm build. cholesky_thunk has a source-code dependency on cusolver_context, which has a source-code dependency on CUDA-specific headers. Disable them in ROCm builds. Since the GOOGLE_CUDA macro is discouraged in XLA, use !TENSORFLOW_USE_ROCM instead. --- tensorflow/compiler/xla/service/gpu/thunk_emitter.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/gpu/thunk_emitter.cc b/tensorflow/compiler/xla/service/gpu/thunk_emitter.cc index 13d32672a95..49157bbc9d4 100644 --- a/tensorflow/compiler/xla/service/gpu/thunk_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/thunk_emitter.cc @@ -17,7 +17,9 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/custom_call_target_registry.h" #include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h" -#include "tensorflow/compiler/xla/service/gpu/cholesky_thunk.h" +#if !TENSORFLOW_USE_ROCM + #include "tensorflow/compiler/xla/service/gpu/cholesky_thunk.h" +#endif #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h" #include "tensorflow/compiler/xla/service/gpu/copy_thunk.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h" @@ -237,6 +239,7 @@ Status ThunkEmitter::HandleCustomCall(HloInstruction* custom_call) { return Status::OK(); } +#if !TENSORFLOW_USE_ROCM if (custom_call->custom_call_target() == kCusolverCholeskyCallTarget) { TF_ASSIGN_OR_RETURN(CholeskyOptions options, custom_call->backend_config()); @@ -280,6 +283,7 @@ Status ThunkEmitter::HandleCustomCall(HloInstruction* custom_call) { return Status::OK(); } +#endif if (IsCublasGemm(*custom_call)) { AddThunkToThunkSequence(BuildGemmThunk(custom_call));