diff --git a/configure.py b/configure.py index 4dd34693267..f051fab1a96 100644 --- a/configure.py +++ b/configure.py @@ -1171,14 +1171,16 @@ def system_specific_test_config(environ_cp): test_only_filters = ['-oss_serial'] if is_windows(): test_and_build_filters.append('-no_windows') - if environ_cp.get('TF_NEED_CUDA', None) == '1': + if ((environ_cp.get('TF_NEED_CUDA', None) == '1') or + (environ_cp.get('TF_NEED_ROCM', None) == '1')): test_and_build_filters += ['-no_windows_gpu', '-no_gpu'] else: test_and_build_filters.append('-gpu') elif is_macos(): test_and_build_filters += ['-gpu', '-nomac', '-no_mac'] elif is_linux(): - if environ_cp.get('TF_NEED_CUDA', None) == '1': + if ((environ_cp.get('TF_NEED_CUDA', None) == '1') or + (environ_cp.get('TF_NEED_ROCM', None) == '1')): test_and_build_filters.append('-no_gpu') write_to_bazelrc('test --test_env=LD_LIBRARY_PATH') else: @@ -1416,6 +1418,10 @@ def main(): write_action_env_to_bazelrc('LD_LIBRARY_PATH', environ_cp.get('LD_LIBRARY_PATH')) + if (environ_cp.get('TF_NEED_ROCM') == '1' and environ_cp.get('ROCM_PATH')): + write_action_env_to_bazelrc('ROCM_PATH',environ_cp.get('ROCM_PATH')) + write_action_env_to_bazelrc('ROCM_ROOT',environ_cp.get('ROCM_PATH')) + environ_cp['TF_NEED_CUDA'] = str( int(get_var(environ_cp, 'TF_NEED_CUDA', 'CUDA', False))) if (environ_cp.get('TF_NEED_CUDA') == '1' and diff --git a/tensorflow/tools/ci_build/Dockerfile.rocm b/tensorflow/tools/ci_build/Dockerfile.rocm index c1928c8e504..6d124204ed8 100644 --- a/tensorflow/tools/ci_build/Dockerfile.rocm +++ b/tensorflow/tools/ci_build/Dockerfile.rocm @@ -3,8 +3,8 @@ FROM ubuntu:xenial MAINTAINER Jeff Poznanovic -ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/3.0/ -ARG ROCM_PATH=/opt/rocm +ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/3.3/ +ARG ROCM_PATH=/opt/rocm-3.3.0 ENV DEBIAN_FRONTEND noninteractive ENV TF_NEED_ROCM 1 @@ -71,7 +71,21 @@ ENV PATH="$ROCM_PATH/bin:${PATH}" ENV PATH="$OPENCL_ROOT/bin:${PATH}" # Add target file to help 
determine which device(s) to build for -RUN bash -c 'echo -e "gfx803\ngfx900\ngfx906" >> /opt/rocm/bin/target.lst' +RUN bash -c 'echo -e "gfx803\ngfx900\ngfx906" >> ${ROCM_PATH}/bin/target.lst' + +# Need to explicitly create the $ROCM_PATH/.info/version file to work around what seems to be a bazel bug +# The env vars being set via --action_env in .bazelrc and .tf_configure.bazelrc files are sometimes +# not getting set in the build command being spawned by bazel (in theory this should not happen) +# As a consequence ROCM_PATH is sometimes not set for the hipcc commands. +# When hipcc invokes hcc, it specifies $ROCM_PATH/.../include dirs via the `-isystem` options +# If ROCM_PATH is not set, it defaults to /opt/rocm, and as a consequence a dependency is generated on the +# header files included within `/opt/rocm`, which then leads to bazel dependency errors +# Explicitly creating the $ROCM_PATH/.info/version allows ROCM path to be set correctly, even when ROCM_PATH +# is not explicitly set, and thus avoids the eventual bazel dependency error. +# The bazel bug needs to be root-caused and addressed, but that is out of our control and may take a long time +# to come to fruition, so implementing the workaround to make do till then +# Filed https://github.com/bazelbuild/bazel/issues/11163 for tracking this +RUN touch ${ROCM_PATH}/.info/version # Copy and run the install scripts. COPY install/*.sh /install/ @@ -90,3 +104,7 @@ COPY install/.bazelrc /etc/bazel.bazelrc # Configure the build for our ROCm configuration. 
ENV TF_NEED_ROCM 1 +# This is a temporary workaround to fix Out-Of-Memory errors we are running into with XLA perf tests +# By default, HIP runtime "hides" 256MB from the TF Runtime, but with recent changes (update to ROCm2.3, dynamic loading of roc* libs, et al) +# it seems that we need to up the threshold slightly to 320MB +ENV HIP_HIDDEN_FREE_MEM=320 diff --git a/tensorflow/tools/ci_build/linux/rocm/run_cc_core.sh b/tensorflow/tools/ci_build/linux/rocm/run_cc_core.sh index 08d99f41622..f1912c512a7 100755 --- a/tensorflow/tools/ci_build/linux/rocm/run_cc_core.sh +++ b/tensorflow/tools/ci_build/linux/rocm/run_cc_core.sh @@ -30,6 +30,7 @@ export PYTHON_BIN_PATH=`which python3` export CC_OPT_FLAGS='-mavx' export TF_NEED_ROCM=1 +export ROCM_PATH=/opt/rocm-3.3.0 export TF_GPU_COUNT=${N_GPUS} yes "" | $PYTHON_BIN_PATH configure.py @@ -42,7 +43,7 @@ bazel test \ --test_lang_filters=cc \ --jobs=${N_JOBS} \ --local_test_jobs=${TF_GPU_COUNT}\ - --test_timeout 300,450,1200,3600 \ + --test_timeout 600,900,2400,7200 \ --build_tests_only \ --test_output=errors \ --test_sharding_strategy=disabled \ diff --git a/tensorflow/tools/ci_build/linux/rocm/run_csb_tests.sh b/tensorflow/tools/ci_build/linux/rocm/run_csb_tests.sh index 61813dfde30..df69044837f 100755 --- a/tensorflow/tools/ci_build/linux/rocm/run_csb_tests.sh +++ b/tensorflow/tools/ci_build/linux/rocm/run_csb_tests.sh @@ -30,6 +30,7 @@ export PYTHON_BIN_PATH=`which python3` export CC_OPT_FLAGS='-mavx' export TF_NEED_ROCM=1 +export ROCM_PATH=/opt/rocm-3.3.0 export TF_GPU_COUNT=${N_GPUS} yes "" | $PYTHON_BIN_PATH configure.py @@ -38,12 +39,13 @@ yes "" | $PYTHON_BIN_PATH configure.py bazel test \ --config=rocm \ -k \ - --test_tag_filters=gpu,-no_gpu,-no_rocm,-benchmark-test,-no_oss,-oss_serial,-rocm_multi_gpu, \ - --test_timeout 600,900,2400,7200 \ - --test_output=errors \ + --test_tag_filters=gpu,-no_oss,-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-v1only \ --jobs=${N_JOBS} \ 
--local_test_jobs=${TF_GPU_COUNT} \ + --test_timeout 600,900,2400,7200 \ + --test_output=errors \ --test_sharding_strategy=disabled \ + --test_size_filters=small,medium \ --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute \ -- \ //tensorflow/... \ diff --git a/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh b/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh index 64bfffad149..b255789b865 100755 --- a/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh +++ b/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh @@ -30,6 +30,7 @@ export PYTHON_BIN_PATH=`which python3` export CC_OPT_FLAGS='-mavx' export TF_NEED_ROCM=1 +export ROCM_PATH=/opt/rocm-3.3.0 export TF_GPU_COUNT=${N_GPUS} yes "" | $PYTHON_BIN_PATH configure.py @@ -46,6 +47,7 @@ bazel test \ --build_tests_only \ --test_output=errors \ --test_sharding_strategy=disabled \ + --test_size_filters=small,medium \ --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute \ -- \ //tensorflow/... \ diff --git a/tensorflow/tools/ci_build/xla/linux/rocm/run_py3.sh b/tensorflow/tools/ci_build/xla/linux/rocm/run_py3.sh index 9288b7b3582..6ce1fad9cc7 100755 --- a/tensorflow/tools/ci_build/xla/linux/rocm/run_py3.sh +++ b/tensorflow/tools/ci_build/xla/linux/rocm/run_py3.sh @@ -30,6 +30,7 @@ export PYTHON_BIN_PATH=`which python3` export CC_OPT_FLAGS='-mavx' export TF_NEED_ROCM=1 +export ROCM_PATH=/opt/rocm-3.3.0 export TF_GPU_COUNT=${N_GPUS} yes "" | $PYTHON_BIN_PATH configure.py @@ -47,6 +48,7 @@ bazel test \ --build_tests_only \ --test_output=errors \ --test_sharding_strategy=disabled \ + --test_size_filters=small,medium \ --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute \ -- \ //tensorflow/compiler/... \