commit 0764c90855

Merge pull request #42689 from ROCmSoftwarePlatform:google_upstream_switch_to_rocm37

PiperOrigin-RevId: 333874384
Change-Id: Ic5e8b2394120d907e049f71cf5c00029ec75ad9f
@@ -230,9 +230,9 @@ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithMemoryLimitAndNoPriority) {
 TEST_F(GPUDeviceTest, SingleVirtualDeviceWithInvalidPriority) {
   {
 #if TENSORFLOW_USE_ROCM
-    // Priority outside the range (0, 2) for AMD GPUs
+    // Priority outside the range (-1, 1) for AMD GPUs
     SessionOptions opts =
-        MakeSessionOptions("0", 0, 1, {{123, 456}}, {{-1, 2}});
+        MakeSessionOptions("0", 0, 1, {{123, 456}}, {{-2, 1}});
 #else
     // Priority outside the range (-2, 0) for NVidia GPUs
     SessionOptions opts =
@@ -245,7 +245,7 @@ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithInvalidPriority) {
 #if TENSORFLOW_USE_ROCM
     ExpectErrorMessageSubstr(
         status,
-        "Priority -1 is outside the range of supported priorities [0,2] for"
+        "Priority -2 is outside the range of supported priorities [-1,1] for"
         " virtual device 0 on GPU# 0");
 #else
     ExpectErrorMessageSubstr(
@@ -254,8 +254,9 @@ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithInvalidPriority) {
   }
   {
 #if TENSORFLOW_USE_ROCM
-    // Priority outside the range (0, 2) for AMD GPUs
-    SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{0, 3}});
+    // Priority outside the range (-1, 1) for AMD GPUs
+    SessionOptions opts =
+        MakeSessionOptions("0", 0, 1, {{123, 456}}, {{-1, 2}});
 #else
     // Priority outside the range (-2, 0) for NVidia GPUs
     SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{0, 1}});
@@ -267,7 +268,7 @@ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithInvalidPriority) {
 #if TENSORFLOW_USE_ROCM
     ExpectErrorMessageSubstr(
         status,
-        "Priority 3 is outside the range of supported priorities [0,2] for"
+        "Priority 2 is outside the range of supported priorities [-1,1] for"
         " virtual device 0 on GPU# 0");
 #else
     ExpectErrorMessageSubstr(
@@ -288,26 +289,17 @@ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithMemoryLimitAndPriority) {
 }

 TEST_F(GPUDeviceTest, MultipleVirtualDevices) {
-#if TENSORFLOW_USE_ROCM
-  // Valid range for priority values on AMD GPUs in (0,2)
-  SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{0, 1}});
-#else
+  // Valid range for priority values on AMD GPUs in (-1,1)
   // Valid range for priority values on NVidia GPUs in (-2, 0)
   SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{0, -1}});
-#endif
   std::vector<std::unique_ptr<Device>> devices;
   TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices));
   EXPECT_EQ(2, devices.size());
   EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit());
   EXPECT_EQ(456 << 20, devices[1]->attributes().memory_limit());
-#if TENSORFLOW_USE_ROCM
   EXPECT_EQ(0, static_cast<BaseGPUDevice*>(devices[0].get())->priority());
-  EXPECT_EQ(1, static_cast<BaseGPUDevice*>(devices[1].get())->priority());
-#else
-  EXPECT_EQ(0, static_cast<BaseGPUDevice*>(devices[0].get())->priority());
   EXPECT_EQ(-1, static_cast<BaseGPUDevice*>(devices[1].get())->priority());
-#endif
   ASSERT_EQ(1, devices[0]->attributes().locality().links().link_size());
   ASSERT_EQ(1, devices[1]->attributes().locality().links().link_size());
   EXPECT_EQ(1, devices[0]->attributes().locality().links().link(0).device_id());
@@ -339,27 +331,18 @@ TEST_F(GPUDeviceTest, MultipleVirtualDevicesWithPriority) {
   }
   {
     // Multile virtual devices with matching priority.
-#if TENSORFLOW_USE_ROCM
-    // Valid range for priority values on AMD GPUs in (0,2)
-    SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{2, 1}});
-#else
+    // Valid range for priority values on AMD GPUs in (-1,1)
     // Valid range for priority values on NVidia GPUs in (-2, 0)
     SessionOptions opts =
         MakeSessionOptions("0", 0, 1, {{123, 456}}, {{-1, 0}});
-#endif
     std::vector<std::unique_ptr<Device>> devices;
     TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
         opts, kDeviceNamePrefix, &devices));
     EXPECT_EQ(2, devices.size());
     EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit());
     EXPECT_EQ(456 << 20, devices[1]->attributes().memory_limit());
-#if TENSORFLOW_USE_ROCM
-    EXPECT_EQ(2, static_cast<BaseGPUDevice*>(devices[0].get())->priority());
-    EXPECT_EQ(1, static_cast<BaseGPUDevice*>(devices[1].get())->priority());
-#else
     EXPECT_EQ(-1, static_cast<BaseGPUDevice*>(devices[0].get())->priority());
     EXPECT_EQ(0, static_cast<BaseGPUDevice*>(devices[1].get())->priority());
-#endif
   }
 }
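For context: the new [-1,1] bound for AMD GPUs in the tests above matches what the HIP runtime itself reports for stream priorities. A minimal standalone sketch (an illustration only, assuming a hipcc toolchain and one visible AMD GPU) that queries the range these tests validate against:

// priority_range.cc -- build with: hipcc priority_range.cc -o priority_range
#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
  int least = 0, greatest = 0;
  // Returns the bounds of the stream priority range for the current device;
  // as with CUDA, numerically lower values mean higher priority.
  if (hipDeviceGetStreamPriorityRange(&least, &greatest) != hipSuccess) {
    std::fprintf(stderr, "failed to query stream priority range\n");
    return 1;
  }
  std::printf("stream priorities: least=%d greatest=%d\n", least, greatest);
  return 0;
}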
@@ -27,15 +27,8 @@ limitations under the License.
 #define EIGEN_USE_GPU
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/kernels/cwise_ops.h"
+#include "tensorflow/core/platform/types.h"

-#include "tensorflow/core/platform/logging.h"
-
-#ifdef __HIP_DEVICE_COMPILE__
-// Provide ldexp float overload for HIP, it's missing in their headers.
-__device__ inline float ldexp(float x, int exp) { return ldexpf(x, exp); }
-#endif
-
-#include "tensorflow/core/platform/types.h"
 namespace tensorflow {
 namespace functor {
@@ -25,11 +25,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/eigen_contraction_kernel.h"
 #endif

-#ifdef __HIP_DEVICE_COMPILE__
-// Provide ldexp float overload for HIP, it's missing in their headers.
-__device__ inline float ldexp(float x, int exp) { return ldexpf(x, exp); }
-#endif
-
 namespace tensorflow {
 class OpKernelContext;
 namespace functor {
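For context, the deleted workaround only mattered in HIP device code; a small standalone kernel (a sketch, assuming hipcc and one visible GPU, and assuming the ROCm 3.7 math headers now supply the float overload themselves) exercising the ldexpf the removed comment refers to:

// ldexp_check.cc -- build with: hipcc ldexp_check.cc -o ldexp_check
#include <hip/hip_runtime.h>
#include <cstdio>

__global__ void Scale(const float* in, float* out, int e) {
  // ldexpf(x, e) computes x * 2^e in device code.
  out[0] = ldexpf(in[0], e);
}

int main() {
  float h_in = 3.0f, h_out = 0.0f;
  float *d_in = nullptr, *d_out = nullptr;
  hipMalloc(reinterpret_cast<void**>(&d_in), sizeof(float));
  hipMalloc(reinterpret_cast<void**>(&d_out), sizeof(float));
  hipMemcpy(d_in, &h_in, sizeof(float), hipMemcpyHostToDevice);
  hipLaunchKernelGGL(Scale, dim3(1), dim3(1), 0, 0, d_in, d_out, 2);
  hipMemcpy(&h_out, d_out, sizeof(float), hipMemcpyDeviceToHost);
  std::printf("ldexpf(3.0f, 2) = %g\n", h_out);  // expect 12
  hipFree(d_in);
  hipFree(d_out);
  return 0;
}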
@@ -5414,6 +5414,7 @@ cuda_py_test(
    python_version = "PY3",
    shard_count = 10,
    tags = [
+        "no_rocm",
        "no_windows_gpu",
        "noasan",  # b/159332048
        "nomsan",  # b/148630708
@@ -157,6 +157,7 @@ cuda_py_test(
    size = "medium",
    srcs = ["adadelta_test.py"],
    shard_count = 4,
+    tags = ["no_rocm"],
    tfrt_enabled = True,
    deps = [
        ":optimizer_v2",
@@ -298,6 +299,7 @@ cuda_py_test(
    size = "medium",
    srcs = ["rmsprop_test.py"],
    shard_count = 2,
+    tags = ["no_rocm"],
    tfrt_enabled = True,
    deps = [
        ":optimizer_v2",
@@ -82,6 +82,11 @@ class MathTest(PForTestCase, parameterized.TestCase):
     self._test_unary_cwise_ops(complex_ops, True)

   def test_unary_cwise_real_ops_1(self):
+    if test.is_built_with_rocm():
+      # TODO(rocm):
+      # This fails on ROCm...see JIRA ticket 236756
+      self.skipTest("Fails on ROCM")
+
     real_ops = [
         lambda x: math_ops.acosh(1 + math_ops.square(x)),
         math_ops.abs,
@@ -139,6 +139,11 @@ class RaggedDispatchTest(test_util.TensorFlowTestCase, parameterized.TestCase):
      ]
  )  # pyformat: disable
  def testUnaryElementwiseOp(self, x, op=math_ops.abs, **extra_args):
+    if test_util.IsBuiltWithROCm():
+      # TODO(rocm):
+      # This fails on ROCm...see JIRA ticket 236756
+      self.skipTest('Fails on ROCM')
+
    result = op(x, **extra_args)

    # Run the wrapped op on the dense values, for comparison.
@@ -140,7 +140,7 @@ port::StatusOr<void*> GetHipsparseDsoHandle() {
   return GetDsoHandle("hipsparse", "");
 }

-port::StatusOr<void*> GetHipDsoHandle() { return GetDsoHandle("hip_hcc", ""); }
+port::StatusOr<void*> GetHipDsoHandle() { return GetDsoHandle("amdhip64", ""); }

 }  // namespace DsoLoader
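The rename tracks the HIP runtime shared library shipped by newer ROCm releases: libamdhip64.so replaces the old libhip_hcc.so. A quick illustrative probe (a sketch only, assuming a Linux host with the ROCm library directory on the loader path) for checking which name resolves:

// dso_probe.cc -- build with: g++ dso_probe.cc -ldl -o dso_probe
#include <dlfcn.h>
#include <cstdio>

// Tries to dlopen a library name, roughly the way the DSO loader would.
static bool Probe(const char* name) {
  void* handle = dlopen(name, RTLD_NOW | RTLD_LOCAL);
  std::printf("%-20s -> %s\n", name, handle ? "found" : dlerror());
  if (handle) dlclose(handle);
  return handle != nullptr;
}

int main() {
  Probe("libamdhip64.so");  // HIP runtime name used by newer ROCm releases
  Probe("libhip_hcc.so");   // older HCC-era runtime name, now retired
  return 0;
}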
@@ -113,9 +113,6 @@ string ToString(miopenConvFwdAlgorithm_t algorithm) {
     case miopenConvolutionFwdAlgoImplicitGEMM:
       s = "Implicit GEMM";
       break;
-    case miopenConvolutionFwdAlgoStaticCompiledGEMM:
-      s = "Static Compiled GEMM";
-      break;
   }
   return s;
 }
@@ -182,9 +179,6 @@ string ToString(miopenConvAlgorithm_t algorithm) {
     case miopenConvolutionAlgoImplicitGEMM:
       s = "Implicit GEMM";
       break;
-    case miopenConvolutionAlgoStaticCompiledGEMM:
-      s = "Static Compiled GEMM";
-      break;
   }
   return s;
 }
@@ -3,8 +3,10 @@
 FROM ubuntu:bionic
 MAINTAINER Jeff Poznanovic <jeffrey.poznanovic@amd.com>

-ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/3.3/
-ARG ROCM_PATH=/opt/rocm-3.3.0
+ARG ROCM_DEB_REPO=http://repo.radeon.com/rocm/apt/3.7/
+ARG ROCM_BUILD_NAME=xenial
+ARG ROCM_BUILD_NUM=main
+ARG ROCM_PATH=/opt/rocm-3.7.0

 ENV DEBIAN_FRONTEND noninteractive
 ENV TF_NEED_ROCM 1
@@ -13,8 +15,12 @@ RUN apt update && apt install -y wget software-properties-common

 # Add rocm repository
 RUN apt-get clean all
-RUN wget -qO - $DEB_ROCM_REPO/rocm.gpg.key | apt-key add -
-RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO xenial main > /etc/apt/sources.list.d/rocm.list"
+RUN bin/bash -c 'if [[ $ROCM_DEB_REPO == http://repo.radeon.com/rocm/* ]] ; then \
+      wget -qO - $ROCM_DEB_REPO/rocm.gpg.key | apt-key add -; \
+      echo "deb [arch=amd64] $ROCM_DEB_REPO $ROCM_BUILD_NAME $ROCM_BUILD_NUM" > /etc/apt/sources.list.d/rocm.list; \
+   else \
+      echo "deb [arch=amd64 trusted=yes] $ROCM_DEB_REPO $ROCM_BUILD_NAME $ROCM_BUILD_NUM" > /etc/apt/sources.list.d/rocm.list ; \
+   fi'

 # Install misc pkgs
 RUN apt-get update --allow-insecure-repositories && DEBIAN_FRONTEND=noninteractive apt-get install -y \
@@ -18,20 +18,27 @@
 set -e
 set -x

-N_JOBS=$(grep -c ^processor /proc/cpuinfo)
-N_GPUS=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l)
+N_BUILD_JOBS=$(grep -c ^processor /proc/cpuinfo)
+TF_GPU_COUNT=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l)
+TF_TESTS_PER_GPU=1
+N_TEST_JOBS=$(expr ${TF_GPU_COUNT} \* ${TF_TESTS_PER_GPU})

 echo ""
-echo "Bazel will use ${N_JOBS} concurrent build job(s) and ${N_GPUS} concurrent test job(s)."
+echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s)."
 echo ""

+# First positional argument (if any) specifies the ROCM_INSTALL_DIR
+ROCM_INSTALL_DIR=/opt/rocm-3.7.0
+if [[ -n $1 ]]; then
+    ROCM_INSTALL_DIR=$1
+fi
+
 # Run configure.
 export PYTHON_BIN_PATH=`which python3`
 export CC_OPT_FLAGS='-mavx'

 export TF_NEED_ROCM=1
-export ROCM_PATH=/opt/rocm-3.3.0
-export TF_GPU_COUNT=${N_GPUS}
+export ROCM_PATH=$ROCM_INSTALL_DIR

 yes "" | $PYTHON_BIN_PATH configure.py
@@ -39,15 +46,17 @@ yes "" | $PYTHON_BIN_PATH configure.py
 bazel test \
     --config=rocm \
     -k \
-    --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-v1only \
+    --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-multi_gpu,-v1only \
     --test_lang_filters=cc \
-    --jobs=${N_JOBS} \
-    --local_test_jobs=${TF_GPU_COUNT}\
+    --jobs=${N_BUILD_JOBS} \
+    --local_test_jobs=${N_TEST_JOBS} \
+    --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \
+    --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
     --test_timeout 600,900,2400,7200 \
     --build_tests_only \
     --test_output=errors \
     --test_sharding_strategy=disabled \
-    --test_size_filters=small,medium \
+    --test_size_filters=small,medium,large \
     --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute \
     -- \
     //tensorflow/... \
@@ -59,11 +68,14 @@ bazel test \
     --config=rocm \
     -k \
     --test_tag_filters=gpu \
-    --jobs=${N_JOBS} \
-    --local_test_jobs=1 \
+    --jobs=${N_BUILD_JOBS} \
+    --local_test_jobs=${N_TEST_JOBS} \
+    --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \
+    --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
     --test_timeout 600,900,2400,7200 \
     --build_tests_only \
     --test_output=errors \
     --test_sharding_strategy=disabled \
     --test_size_filters=small,medium,large \
     -- \
     //tensorflow/core/nccl:nccl_manager_test
@@ -18,20 +18,27 @@
 set -e
 set -x

-N_JOBS=$(grep -c ^processor /proc/cpuinfo)
-N_GPUS=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l)
+N_BUILD_JOBS=$(grep -c ^processor /proc/cpuinfo)
+TF_GPU_COUNT=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l)
+TF_TESTS_PER_GPU=1
+N_TEST_JOBS=$(expr ${TF_GPU_COUNT} \* ${TF_TESTS_PER_GPU})

 echo ""
-echo "Bazel will use ${N_JOBS} concurrent build job(s) and ${N_GPUS} concurrent test job(s)."
+echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s)."
 echo ""

+# First positional argument (if any) specifies the ROCM_INSTALL_DIR
+ROCM_INSTALL_DIR=/opt/rocm-3.7.0
+if [[ -n $1 ]]; then
+    ROCM_INSTALL_DIR=$1
+fi
+
 # Run configure.
 export PYTHON_BIN_PATH=`which python3`
 export CC_OPT_FLAGS='-mavx'

 export TF_NEED_ROCM=1
-export ROCM_PATH=/opt/rocm-3.3.0
-export TF_GPU_COUNT=${N_GPUS}
+export ROCM_PATH=$ROCM_INSTALL_DIR

 yes "" | $PYTHON_BIN_PATH configure.py
@@ -40,8 +47,10 @@ bazel test \
     --config=rocm \
     -k \
     --test_tag_filters=gpu,-no_oss,-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-v1only \
-    --jobs=${N_JOBS} \
-    --local_test_jobs=${TF_GPU_COUNT} \
+    --jobs=${N_BUILD_JOBS} \
+    --local_test_jobs=${N_TEST_JOBS} \
+    --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \
+    --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
     --test_timeout 600,900,2400,7200 \
     --test_output=errors \
     --test_sharding_strategy=disabled \
@@ -60,8 +69,8 @@ bazel test \
     --test_tag_filters=gpu \
     --test_timeout 600,900,2400,7200 \
     --test_output=errors \
-    --jobs=${N_JOBS} \
-    --local_test_jobs=1 \
+    --jobs=${N_BUILD_JOBS} \
+    --local_test_jobs=${N_TEST_JOBS} \
     --test_sharding_strategy=disabled \
     -- \
     //tensorflow/core/nccl:nccl_manager_test
@@ -18,20 +18,27 @@
 set -e
 set -x

-N_JOBS=$(grep -c ^processor /proc/cpuinfo)
-N_GPUS=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l)
+N_BUILD_JOBS=$(grep -c ^processor /proc/cpuinfo)
+TF_GPU_COUNT=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l)
+TF_TESTS_PER_GPU=1
+N_TEST_JOBS=$(expr ${TF_GPU_COUNT} \* ${TF_TESTS_PER_GPU})

 echo ""
-echo "Bazel will use ${N_JOBS} concurrent build job(s) and ${N_GPUS} concurrent test job(s)."
+echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s)."
 echo ""

+# First positional argument (if any) specifies the ROCM_INSTALL_DIR
+ROCM_INSTALL_DIR=/opt/rocm-3.7.0
+if [[ -n $1 ]]; then
+    ROCM_INSTALL_DIR=$1
+fi
+
 # Run configure.
 export PYTHON_BIN_PATH=`which python3`
 export CC_OPT_FLAGS='-mavx'

 export TF_NEED_ROCM=1
-export ROCM_PATH=/opt/rocm-3.3.0
-export TF_GPU_COUNT=${N_GPUS}
+export ROCM_PATH=$ROCM_INSTALL_DIR

 yes "" | $PYTHON_BIN_PATH configure.py
@@ -41,8 +48,10 @@ bazel test \
     -k \
     --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-v1only \
     --test_lang_filters=py \
-    --jobs=${N_JOBS} \
-    --local_test_jobs=${TF_GPU_COUNT} \
+    --jobs=${N_BUILD_JOBS} \
+    --local_test_jobs=${N_TEST_JOBS} \
+    --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \
+    --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
     --test_timeout 600,900,2400,7200 \
     --build_tests_only \
     --test_output=errors \
@@ -18,20 +18,27 @@
 set -e
 set -x

-N_JOBS=$(grep -c ^processor /proc/cpuinfo)
-N_GPUS=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l)
+N_BUILD_JOBS=$(grep -c ^processor /proc/cpuinfo)
+TF_GPU_COUNT=$(lspci|grep 'controller'|grep 'AMD/ATI'|wc -l)
+TF_TESTS_PER_GPU=1
+N_TEST_JOBS=$(expr ${TF_GPU_COUNT} \* ${TF_TESTS_PER_GPU})

 echo ""
-echo "Bazel will use ${N_JOBS} concurrent build job(s) and ${N_GPUS} concurrent test job(s)."
+echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s)."
 echo ""

+# First positional argument (if any) specifies the ROCM_INSTALL_DIR
+ROCM_INSTALL_DIR=/opt/rocm-3.7.0
+if [[ -n $1 ]]; then
+    ROCM_INSTALL_DIR=$1
+fi
+
 # Run configure.
 export PYTHON_BIN_PATH=`which python3`
 export CC_OPT_FLAGS='-mavx'

 export TF_NEED_ROCM=1
-export ROCM_PATH=/opt/rocm-3.3.0
-export TF_GPU_COUNT=${N_GPUS}
+export ROCM_PATH=$ROCM_INSTALL_DIR

 yes "" | $PYTHON_BIN_PATH configure.py
 echo "build --distinct_host_configuration=false" >> .tf_configure.bazelrc
@@ -41,9 +48,11 @@ bazel test \
     --config=rocm \
     --config=xla \
     -k \
-    --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-v1only \
-    --jobs=${N_JOBS} \
-    --local_test_jobs=${TF_GPU_COUNT} \
+    --test_tag_filters=-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-v1only \
+    --jobs=${N_BUILD_JOBS} \
+    --local_test_jobs=${N_TEST_JOBS} \
+    --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \
+    --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
     --test_timeout 600,900,2400,7200 \
     --build_tests_only \
     --test_output=errors \
@@ -65,9 +74,11 @@ bazel test \
     --config=rocm \
     --config=xla \
     -k \
-    --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-v1only \
-    --jobs=${N_JOBS} \
-    --local_test_jobs=${TF_GPU_COUNT} \
+    --test_tag_filters=-oss_serial,-no_gpu,-no_rocm,-benchmark-test,-rocm_multi_gpu,-v1only \
+    --jobs=${N_BUILD_JOBS} \
+    --local_test_jobs=${N_TEST_JOBS} \
+    --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \
+    --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
     --test_timeout 600,900,2400,7200 \
     --build_tests_only \
     --test_output=errors \
@@ -34,8 +34,6 @@ HIPCC_ENV = '%{hipcc_env}'
 HIPCC_IS_HIPCLANG = '%{hipcc_is_hipclang}'=="True"
 HIP_RUNTIME_PATH = '%{hip_runtime_path}'
 HIP_RUNTIME_LIBRARY = '%{hip_runtime_library}'
-HCC_RUNTIME_PATH = '%{hcc_runtime_path}'
-HCC_RUNTIME_LIBRARY = '%{hcc_runtime_library}'
 ROCR_RUNTIME_PATH = '%{rocr_runtime_path}'
 ROCR_RUNTIME_LIBRARY = '%{rocr_runtime_library}'
 VERBOSE = '%{crosstool_verbose}'=='1'
@@ -267,11 +265,6 @@ def main():
   gpu_linker_flags.append('-L' + ROCR_RUNTIME_PATH)
   gpu_linker_flags.append('-Wl,-rpath=' + ROCR_RUNTIME_PATH)
   gpu_linker_flags.append('-l' + ROCR_RUNTIME_LIBRARY)
-  # do not link with HCC runtime library in case hip-clang toolchain is used
-  if not HIPCC_IS_HIPCLANG:
-    gpu_linker_flags.append('-L' + HCC_RUNTIME_PATH)
-    gpu_linker_flags.append('-Wl,-rpath=' + HCC_RUNTIME_PATH)
-    gpu_linker_flags.append('-l' + HCC_RUNTIME_LIBRARY)
   gpu_linker_flags.append('-L' + HIP_RUNTIME_PATH)
   gpu_linker_flags.append('-Wl,-rpath=' + HIP_RUNTIME_PATH)
   gpu_linker_flags.append('-l' + HIP_RUNTIME_LIBRARY)
third_party/gpus/rocm_configure.bzl
@@ -390,7 +390,7 @@ def _find_libs(repository_ctx, rocm_config, bash_bin):
    libs_paths = [
        (name, _rocm_lib_paths(repository_ctx, name, path))
        for name, path in [
-            ("hip_hcc", rocm_config.rocm_toolkit_path + "/hip"),
+            ("amdhip64", rocm_config.rocm_toolkit_path + "/hip"),
            ("rocblas", rocm_config.rocm_toolkit_path + "/rocblas"),
            ("rocfft", rocm_config.rocm_toolkit_path + "/rocfft"),
            ("hiprand", rocm_config.rocm_toolkit_path + "/hiprand"),
@@ -646,7 +646,7 @@ def _create_local_rocm_repository(repository_ctx):
        "rocm/BUILD",
        tpl_paths["rocm:BUILD"],
        {
-            "%{hip_lib}": rocm_libs["hip_hcc"].file_name,
+            "%{hip_lib}": rocm_libs["amdhip64"].file_name,
            "%{rocblas_lib}": rocm_libs["rocblas"].file_name,
            "%{rocfft_lib}": rocm_libs["rocfft"].file_name,
            "%{hiprand_lib}": rocm_libs["hiprand"].file_name,
@@ -733,9 +733,7 @@ def _create_local_rocm_repository(repository_ctx):
            "%{rocr_runtime_path}": rocm_config.rocm_toolkit_path + "/lib",
            "%{rocr_runtime_library}": "hsa-runtime64",
            "%{hip_runtime_path}": rocm_config.rocm_toolkit_path + "/hip/lib",
-            "%{hip_runtime_library}": "hip_hcc",
-            "%{hcc_runtime_path}": rocm_config.rocm_toolkit_path + "/hcc/lib",
-            "%{hcc_runtime_library}": "mcwamp",
+            "%{hip_runtime_library}": "amdhip64",
            "%{crosstool_verbose}": _crosstool_verbose(repository_ctx),
            "%{gcc_host_compiler_path}": str(cc),
        },
@@ -12,6 +12,6 @@ container_digests = {
    "cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython": "sha256:3f890a951c81a201d60d0161a56ce628a90323be0c7f795550caa37f6f41a85c",
    "cuda10.1-cudnn7-ubuntu18.04-manylinux2010-multipython": "sha256:bd7666d1ef49b2b2e2a64981f1c9234deeccdb0d5198b30ff4289c3dfcffedbf",
    "cuda11.0-cudnn8-ubuntu18.04-manylinux2010-multipython": "sha256:b52edb4e35c780334ba417b008927722ae668847715a1624e9b2984e99c05338",
-    "rocm-ubuntu18.04-manylinux2010-multipython": "sha256:ac52a60d12d0c9f81e558782b5431127b93bb1a13dab7294b3a5b3de91173019",
+    "rocm-ubuntu18.04-manylinux2010-multipython": "sha256:8c6ba5a831c23906716cc9e9c201081f2b5632e3bf3cbc0207da0ddbef18d525",
    "windows-1803": "sha256:f109576c7c0c8a1783ff22b666e8923b52dbbe7933f69a1c7a7275202c304a12",
 }