diff --git a/configure.py b/configure.py index 8ec47294b47..4d1964f427c 100644 --- a/configure.py +++ b/configure.py @@ -1170,7 +1170,8 @@ def system_specific_test_config(env): write_to_bazelrc('test --test_tag_filters=-gpu,-nomac,-no_mac') write_to_bazelrc('test --build_tag_filters=-gpu,-nomac,-no_mac') elif is_linux(): - if env.get('TF_NEED_CUDA', None) == '1': + if ((env.get('TF_NEED_CUDA', None) == '1') or + (env.get('TF_NEED_ROCM', None) == '1')): write_to_bazelrc('test --test_tag_filters=-no_gpu') write_to_bazelrc('test --build_tag_filters=-no_gpu') write_to_bazelrc('test --test_env=LD_LIBRARY_PATH') @@ -1414,6 +1415,10 @@ def main(): write_action_env_to_bazelrc('LD_LIBRARY_PATH', environ_cp.get('LD_LIBRARY_PATH')) + if (environ_cp.get('TF_NEED_ROCM') == '1' and environ_cp.get('ROCM_PATH')): + write_action_env_to_bazelrc('ROCM_PATH', environ_cp.get('ROCM_PATH')) + write_action_env_to_bazelrc('ROCM_ROOT', environ_cp.get('ROCM_PATH')) + environ_cp['TF_NEED_CUDA'] = str( int(get_var(environ_cp, 'TF_NEED_CUDA', 'CUDA', False))) if (environ_cp.get('TF_NEED_CUDA') == '1' and diff --git a/tensorflow/tools/ci_build/Dockerfile.rocm b/tensorflow/tools/ci_build/Dockerfile.rocm index 70029d2a9a9..130d198ece0 100644 --- a/tensorflow/tools/ci_build/Dockerfile.rocm +++ b/tensorflow/tools/ci_build/Dockerfile.rocm @@ -62,6 +62,7 @@ RUN apt-get update --allow-insecure-repositories && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* +# Set up paths ENV HCC_HOME=$ROCM_PATH/hcc ENV HIP_PATH=$ROCM_PATH/hip ENV OPENCL_ROOT=$ROCM_PATH/opencl @@ -70,7 +71,21 @@ ENV PATH="$ROCM_PATH/bin:${PATH}" ENV PATH="$OPENCL_ROOT/bin:${PATH}" # Add target file to help determine which device(s) to build for -RUN bash -c 'echo -e "gfx803\ngfx900\ngfx906" >> /opt/rocm/bin/target.lst' +RUN bash -c 'echo -e "gfx803\ngfx900\ngfx906" >> ${ROCM_PATH}/bin/target.lst' + +# Need to explicitly create the $ROCM_PATH/.info/version file to workaround what seems to be a bazel bug +# The env vars being set via 
--action_env in .bazelrc and .tf_configure.bazelrc files are sometimes +# not getting set in the build command being spawned by bazel (in theory this should not happen) +# As a consequence ROCM_PATH is sometimes not set for the hipcc commands. +# When hipcc invokes hcc, it specifies $ROCM_PATH/.../include dirs via the `-isystem` options +# If ROCM_PATH is not set, it defaults to /opt/rocm, and as a consequence a dependency is generated on the +# header files included within `/opt/rocm`, which then leads to bazel dependency errors +# Explicitly creating the $ROCM_PATH/.info/version allows ROCM path to be set correctly, even when ROCM_PATH +# is not explicitly set, and thus avoids the eventual bazel dependency error. +# The bazel bug needs to be root-caused and addressed, but that is out of our control and may take a long time +# to come to fruition, so implementing the workaround to make do till then +# Filed https://github.com/bazelbuild/bazel/issues/11163 for tracking this +RUN touch ${ROCM_PATH}/.info/version # Copy and run the install scripts. COPY install/*.sh /install/ @@ -89,3 +104,7 @@ COPY install/.bazelrc /etc/bazel.bazelrc # Configure the build for our ROCm configuration. ENV TF_NEED_ROCM 1 +# This is a temporary workaround to fix Out-Of-Memory errors we are running into with XLA perf tests +# By default, HIP runtime "hides" 256MB from the TF Runtime, but with recent changes (update to ROCm2.3, dynamic loading of roc* libs, et al) +# it seems that we need to up the threshold slightly to 320MB +ENV HIP_HIDDEN_FREE_MEM=320 diff --git a/third_party/gpus/rocm/rocm_config.h.tpl b/third_party/gpus/rocm/rocm_config.h.tpl index c5f25a845ca..957413b9acd 100644 --- a/third_party/gpus/rocm/rocm_config.h.tpl +++ b/third_party/gpus/rocm/rocm_config.h.tpl @@ -16,6 +16,6 @@ limitations under the License. 
#ifndef ROCM_ROCM_CONFIG_H_ #define ROCM_ROCM_CONFIG_H_ -#define TF_ROCM_TOOLKIT_PATH "/opt/rocm" +#define TF_ROCM_TOOLKIT_PATH "%{rocm_toolkit_path}" #endif // ROCM_ROCM_CONFIG_H_ diff --git a/third_party/gpus/rocm_configure.bzl b/third_party/gpus/rocm_configure.bzl index 8a8728fa4f7..7f89b3ccbb4 100644 --- a/third_party/gpus/rocm_configure.bzl +++ b/third_party/gpus/rocm_configure.bzl @@ -22,7 +22,7 @@ load( _GCC_HOST_COMPILER_PATH = "GCC_HOST_COMPILER_PATH" _GCC_HOST_COMPILER_PREFIX = "GCC_HOST_COMPILER_PREFIX" -_ROCM_TOOLKIT_PATH = "ROCM_TOOLKIT_PATH" +_ROCM_TOOLKIT_PATH = "ROCM_PATH" _TF_ROCM_VERSION = "TF_ROCM_VERSION" _TF_MIOPEN_VERSION = "TF_MIOPEN_VERSION" _TF_ROCM_AMDGPU_TARGETS = "TF_ROCM_AMDGPU_TARGETS" @@ -192,55 +192,55 @@ def _rocm_include_path(repository_ctx, rocm_config): inc_dirs.append(rocm_config.rocm_toolkit_path + "/include") # Add HSA headers - inc_dirs.append("/opt/rocm/hsa/include") + inc_dirs.append(rocm_config.rocm_toolkit_path + "/hsa/include") # Add HIP headers - inc_dirs.append("/opt/rocm/include/hip") - inc_dirs.append("/opt/rocm/include/hip/hcc_detail") - inc_dirs.append("/opt/rocm/hip/include") + inc_dirs.append(rocm_config.rocm_toolkit_path + "/include/hip") + inc_dirs.append(rocm_config.rocm_toolkit_path + "/include/hip/hcc_detail") + inc_dirs.append(rocm_config.rocm_toolkit_path + "/hip/include") # Add HIP-Clang headers - inc_dirs.append("/opt/rocm/llvm/lib/clang/8.0/include") - inc_dirs.append("/opt/rocm/llvm/lib/clang/9.0.0/include") - inc_dirs.append("/opt/rocm/llvm/lib/clang/10.0.0/include") - inc_dirs.append("/opt/rocm/llvm/lib/clang/11.0.0/include") + inc_dirs.append(rocm_config.rocm_toolkit_path + "/llvm/lib/clang/8.0/include") + inc_dirs.append(rocm_config.rocm_toolkit_path + "/llvm/lib/clang/9.0.0/include") + inc_dirs.append(rocm_config.rocm_toolkit_path + "/llvm/lib/clang/10.0.0/include") + inc_dirs.append(rocm_config.rocm_toolkit_path + "/llvm/lib/clang/11.0.0/include") # Add rocrand and hiprand headers - 
inc_dirs.append("/opt/rocm/rocrand/include") - inc_dirs.append("/opt/rocm/hiprand/include") + inc_dirs.append(rocm_config.rocm_toolkit_path + "/rocrand/include") + inc_dirs.append(rocm_config.rocm_toolkit_path + "/hiprand/include") # Add rocfft headers - inc_dirs.append("/opt/rocm/rocfft/include") + inc_dirs.append(rocm_config.rocm_toolkit_path + "/rocfft/include") # Add rocBLAS headers - inc_dirs.append("/opt/rocm/rocblas/include") + inc_dirs.append(rocm_config.rocm_toolkit_path + "/rocblas/include") # Add MIOpen headers - inc_dirs.append("/opt/rocm/miopen/include") + inc_dirs.append(rocm_config.rocm_toolkit_path + "/miopen/include") # Add RCCL headers - inc_dirs.append("/opt/rocm/rccl/include") + inc_dirs.append(rocm_config.rocm_toolkit_path + "/rccl/include") # Add hcc headers - inc_dirs.append("/opt/rocm/hcc/include") - inc_dirs.append("/opt/rocm/hcc/compiler/lib/clang/7.0.0/include/") - inc_dirs.append("/opt/rocm/hcc/lib/clang/7.0.0/include") + inc_dirs.append(rocm_config.rocm_toolkit_path + "/hcc/include") + inc_dirs.append(rocm_config.rocm_toolkit_path + "/hcc/compiler/lib/clang/7.0.0/include/") + inc_dirs.append(rocm_config.rocm_toolkit_path + "/hcc/lib/clang/7.0.0/include") # Newer hcc builds use/are based off of clang 8.0.0. 
- inc_dirs.append("/opt/rocm/hcc/compiler/lib/clang/8.0.0/include/") - inc_dirs.append("/opt/rocm/hcc/lib/clang/8.0.0/include") + inc_dirs.append(rocm_config.rocm_toolkit_path + "/hcc/compiler/lib/clang/8.0.0/include/") + inc_dirs.append(rocm_config.rocm_toolkit_path + "/hcc/lib/clang/8.0.0/include") # Support hcc based off clang 9.0.0, included in ROCm2.2 - inc_dirs.append("/opt/rocm/hcc/compiler/lib/clang/9.0.0/include/") - inc_dirs.append("/opt/rocm/hcc/lib/clang/9.0.0/include") + inc_dirs.append(rocm_config.rocm_toolkit_path + "/hcc/compiler/lib/clang/9.0.0/include/") + inc_dirs.append(rocm_config.rocm_toolkit_path + "/hcc/lib/clang/9.0.0/include") # Support hcc based off clang 10.0.0, included in ROCm2.8 - inc_dirs.append("/opt/rocm/hcc/compiler/lib/clang/10.0.0/include/") - inc_dirs.append("/opt/rocm/hcc/lib/clang/10.0.0/include") + inc_dirs.append(rocm_config.rocm_toolkit_path + "/hcc/compiler/lib/clang/10.0.0/include/") + inc_dirs.append(rocm_config.rocm_toolkit_path + "/hcc/lib/clang/10.0.0/include") # Support hcc based off clang 11.0.0, included in ROCm3.1 - inc_dirs.append("/opt/rocm/hcc/compiler/lib/clang/11.0.0/include/") - inc_dirs.append("/opt/rocm/hcc/lib/clang/11.0.0/include") + inc_dirs.append(rocm_config.rocm_toolkit_path + "/hcc/compiler/lib/clang/11.0.0/include/") + inc_dirs.append(rocm_config.rocm_toolkit_path + "/hcc/lib/clang/11.0.0/include") return inc_dirs @@ -306,11 +306,12 @@ def _hipcc_env(repository_ctx): repository_ctx.os.environ[name].strip() + "\";") return hipcc_env.strip() -def _hipcc_is_hipclang(repository_ctx): +def _hipcc_is_hipclang(repository_ctx,rocm_config): """Returns if hipcc is based on hip-clang toolchain. Args: repository_ctx: The repository context. + rocm_config: The path to the hip compiler. Returns: A string "True" if hipcc is based on hip-clang toolchain. 
@@ -325,7 +326,7 @@ def _hipcc_is_hipclang(repository_ctx): # grep for "HIP_COMPILER=clang" in /opt/rocm/hip/lib/.hipInfo grep_result = _execute( repository_ctx, - ["grep", "HIP_COMPILER=clang", "/opt/rocm/hip/lib/.hipInfo"], + ["grep", "HIP_COMPILER=clang", rocm_config.rocm_toolkit_path + "/hip/lib/.hipInfo"], empty_stdout_fine = True, ) result = grep_result.stdout.strip() @@ -333,13 +334,14 @@ def _hipcc_is_hipclang(repository_ctx): return "True" return "False" -def _if_hipcc_is_hipclang(repository_ctx, if_true, if_false = []): +def _if_hipcc_is_hipclang(repository_ctx, rocm_config, if_true, if_false = []): """ Returns either the if_true or if_false arg based on whether hipcc is based on the hip-clang toolchain Args : repository_ctx: The repository context. + rocm_config: The path to the hip compiler. if_true : value to return if hipcc is hip-clang based if_false : value to return if hipcc is not hip-clang based (optional, defaults to empty list) @@ -347,7 +349,7 @@ def _if_hipcc_is_hipclang(repository_ctx, if_true, if_false = []): Returns : either the if_true arg or the of_False arg """ - if _hipcc_is_hipclang(repository_ctx) == "True": + if _hipcc_is_hipclang(repository_ctx,rocm_config) == "True": return if_true return if_false @@ -768,7 +770,7 @@ def _create_local_rocm_repository(repository_ctx): rocm_defines["%{host_compiler_prefix}"] = host_compiler_prefix - rocm_defines["%{linker_bin_path}"] = "/opt/rocm/hcc/compiler/bin" + rocm_defines["%{linker_bin_path}"] = rocm_config.rocm_toolkit_path + "/hcc/compiler/bin" # For gcc, do not canonicalize system header paths; some versions of gcc # pick the shortest possible path for system includes when creating the @@ -781,7 +783,7 @@ def _create_local_rocm_repository(repository_ctx): "-DTENSORFLOW_USE_ROCM=1", "-D__HIP_PLATFORM_HCC__", "-DEIGEN_USE_HIP", - ] + _if_hipcc_is_hipclang(repository_ctx, [ + ] + _if_hipcc_is_hipclang(repository_ctx, rocm_config, [ # # define "TENSORFLOW_COMPILER_IS_HIP_CLANG" when we are 
using clang # based hipcc to compile/build tensorflow @@ -823,14 +825,14 @@ def _create_local_rocm_repository(repository_ctx): "crosstool:clang/bin/crosstool_wrapper_driver_rocm", { "%{cpu_compiler}": str(cc), - "%{hipcc_path}": "/opt/rocm/bin/hipcc", + "%{hipcc_path}": rocm_config.rocm_toolkit_path + "/bin/hipcc", "%{hipcc_env}": _hipcc_env(repository_ctx), - "%{hipcc_is_hipclang}": _hipcc_is_hipclang(repository_ctx), - "%{rocr_runtime_path}": "/opt/rocm/lib", + "%{hipcc_is_hipclang}": _hipcc_is_hipclang(repository_ctx,rocm_config), + "%{rocr_runtime_path}": rocm_config.rocm_toolkit_path + "/lib", "%{rocr_runtime_library}": "hsa-runtime64", - "%{hip_runtime_path}": "/opt/rocm/hip/lib", + "%{hip_runtime_path}": rocm_config.rocm_toolkit_path + "/hip/lib", "%{hip_runtime_library}": "hip_hcc", - "%{hcc_runtime_path}": "/opt/rocm/hcc/lib", + "%{hcc_runtime_path}": rocm_config.rocm_toolkit_path + "/hcc/lib", "%{hcc_runtime_library}": "mcwamp", "%{crosstool_verbose}": _crosstool_verbose(repository_ctx), "%{gcc_host_compiler_path}": str(cc),