From b7ab9b9e5fcff44588763a2c97103b1ae6468e06 Mon Sep 17 00:00:00 2001 From: "Wen-Heng (Jack) Chung" Date: Fri, 9 Aug 2019 17:36:19 +0000 Subject: [PATCH 1/2] Enable XLA JIT compiler on ROCm. - bazel changes to enable xla_gpu_device and xla_gpu_jit on ROCm. - Disable cusolver_context on ROCm. It has source code dependency to CUDA API. - Disable dependency to cholesky_thunk on ROCm. It has source code dependency to CUDA API. - Remove cudnn_conv_algorithm_picker from gpu_compiler_impl dependency list. It is conditionally dependent when CUDA is enabled. - Remove CUDA-specific header inclusions in collective_permute_thunk and custom_call_thunk. These 2 thunks actually work on ROCm. - Partially enable ptxas_utils.h to make things build on ROCm. Full-fledged solution is on PR #30884. --- tensorflow/compiler/jit/BUILD | 16 ++++++++++++++-- tensorflow/compiler/xla/service/gpu/BUILD | 9 ++++----- .../xla/service/gpu/collective_permute_thunk.cc | 1 - .../xla/service/gpu/custom_call_thunk.cc | 1 - 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index 1ebfe235b4d..16848110ece 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -1,5 +1,8 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_test", "cc_header_only_library") load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") +load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured") +load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm") +load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured") load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library", "tf_jit_compilation_passes_extra_deps") load("//tensorflow/core/platform:default/build_config.bzl", "tf_additional_all_protos", "tf_proto_library") @@ -38,7 +41,10 @@ cc_library( ":xla_cpu_device", ":xla_cpu_jit", "//tensorflow/compiler/plugin", - ] + if_cuda([ + ] + if_cuda_is_configured([ + ":xla_gpu_device", + ":xla_gpu_jit", + ]) + 
if_rocm_is_configured([ ":xla_gpu_device", ":xla_gpu_jit", ]), @@ -61,7 +67,13 @@ cc_library( cc_library( name = "xla_gpu_jit", visibility = ["//visibility:public"], - deps = if_cuda([ + deps = if_cuda_is_configured([ + ":jit_compilation_passes", + "//tensorflow/compiler/jit/kernels:xla_ops", + "//tensorflow/compiler/tf2xla/kernels:xla_ops", + "//tensorflow/compiler/tf2xla/kernels:xla_dummy_ops", + "//tensorflow/compiler/xla/service:gpu_plugin", + ]) + if_rocm_is_configured([ ":jit_compilation_passes", "//tensorflow/compiler/jit/kernels:xla_ops", "//tensorflow/compiler/tf2xla/kernels:xla_ops", diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index 9f709ddb058..08a4903eea2 100755 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -731,9 +731,9 @@ tf_cc_test( cc_library( name = "cusolver_context", - srcs = ["cusolver_context.cc"], - hdrs = ["cusolver_context.h"], - deps = [ + srcs = if_cuda_is_configured(["cusolver_context.cc"]), + hdrs = if_cuda_is_configured(["cusolver_context.h"]), + deps = if_cuda_is_configured([ # LINT.IfChange "@local_config_cuda//cuda:cublas_headers", # LINT.ThenChange(//tensorflow/copy.bara.sky:cublas_headers) @@ -745,7 +745,7 @@ cc_library( "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/stream_executor:blas", "//tensorflow/stream_executor/cuda:cusolver_lib", - ], + ]), ) cc_library( @@ -1053,7 +1053,6 @@ cc_library( deps = [ ":alias_passthrough_params", ":cudnn_batchnorm_rewriter", - ":cudnn_conv_algorithm_picker", ":cudnn_conv_padding_legalization", ":cudnn_conv_rewriter", ":fusion_merger", diff --git a/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.cc b/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.cc index 60301b4de64..2fe359861f8 100644 --- a/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.cc @@ -22,7 +22,6 @@ 
limitations under the License. #include "absl/algorithm/container.h" #include "absl/memory/memory.h" -#include "third_party/gpus/cuda/include/cuda_runtime_api.h" #include "tensorflow/compiler/xla/refcounting_hash_map.h" #include "tensorflow/compiler/xla/service/hlo_casting_utils.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" diff --git a/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc b/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc index 65673106391..85571804315 100644 --- a/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc @@ -16,7 +16,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/custom_call_thunk.h" #include "absl/strings/str_format.h" -#include "tensorflow/stream_executor/cuda/cuda_stream.h" #include "tensorflow/stream_executor/gpu/gpu_stream.h" namespace xla { From 4fd5b07bd7c0db869509803c6c2b0d9d13a82bb1 Mon Sep 17 00:00:00 2001 From: "Wen-Heng (Jack) Chung" Date: Thu, 22 Aug 2019 16:41:57 +0000 Subject: [PATCH 2/2] Skip including cholesky_thunk in thunk_emitter on ROCm build. cholesky_thunk has a source-code dependency on cusolver_context, which has a source-code dependency on CUDA-specific headers. Disable them in ROCm builds. Since the GOOGLE_CUDA macro is discouraged in XLA, use !TENSORFLOW_USE_ROCM instead. --- tensorflow/compiler/xla/service/gpu/thunk_emitter.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/gpu/thunk_emitter.cc b/tensorflow/compiler/xla/service/gpu/thunk_emitter.cc index 13d32672a95..49157bbc9d4 100644 --- a/tensorflow/compiler/xla/service/gpu/thunk_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/thunk_emitter.cc @@ -17,7 +17,9 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/custom_call_target_registry.h" #include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h" -#include "tensorflow/compiler/xla/service/gpu/cholesky_thunk.h" +#if !TENSORFLOW_USE_ROCM + #include "tensorflow/compiler/xla/service/gpu/cholesky_thunk.h" +#endif #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h" #include "tensorflow/compiler/xla/service/gpu/copy_thunk.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h" @@ -237,6 +239,7 @@ Status ThunkEmitter::HandleCustomCall(HloInstruction* custom_call) { return Status::OK(); } +#if !TENSORFLOW_USE_ROCM if (custom_call->custom_call_target() == kCusolverCholeskyCallTarget) { TF_ASSIGN_OR_RETURN(CholeskyOptions options, custom_call->backend_config()); @@ -280,6 +283,7 @@ Status ThunkEmitter::HandleCustomCall(HloInstruction* custom_call) { return Status::OK(); } +#endif if (IsCublasGemm(*custom_call)) { AddThunkToThunkSequence(BuildGemmThunk(custom_call));