Merge pull request #35572 from ROCmSoftwarePlatform:google_upstream_rocm_platform_csb_fix_200103

PiperOrigin-RevId: 290583221 Change-Id: Id0a0e96dc4c4ce02edb03f59f5404fb36b75f313
2020-01-20 02:34:51 -08:00 · 2020-01-20 02:34:51 -08:00 · 46c271b15d
commit 46c271b15d
parent 9ea3ee2be7 3e4a3d5c83
13 changed files with 310 additions and 86 deletions
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@ -111,6 +111,7 @@ cc_library(
        "//tensorflow/compiler/tf2xla/kernels:xla_ops",
        "//tensorflow/compiler/xla/service:gpu_plugin",  # buildcleaner: keep
        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:gpu_init",
        "//tensorflow/core:lib",
        "@com_google_absl//absl/memory",
        "@com_google_absl//absl/strings",
--- a/tensorflow/compiler/jit/xla_gpu_device.cc
+++ b/tensorflow/compiler/jit/xla_gpu_device.cc
@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/

 // Registers the XLA_GPU device, which is an XlaDevice instantiation that runs
-// operators using XLA via the XLA "CUDA" (GPU) backend.
+// operators using XLA via the XLA "CUDA" or "ROCM" (GPU) backend.

 #include <set>

@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/compiler/jit/xla_device_ops.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
 #include "tensorflow/core/lib/core/status.h"

 namespace tensorflow {
@ -69,7 +70,8 @@ Status XlaGpuDeviceFactory::ListPhysicalDevices(std::vector<string>* devices) {
    return Status::OK();
  }

-  auto platform = se::MultiPlatformManager::PlatformWithName("CUDA");
+  auto platform =
+      se::MultiPlatformManager::PlatformWithName(tensorflow::GpuPlatformName());
  if (!platform.ok()) {
    // Treat failures as non-fatal; there might not be a GPU in the machine.
    VLOG(1) << "Failed to create XLA_GPU device: " << platform.status();
@ -117,7 +119,8 @@ Status XlaGpuDeviceFactory::CreateDevices(
      RegisterXlaDeviceKernels(DEVICE_XLA_GPU, DEVICE_GPU_XLA_JIT);
  (void)registrations;

-  auto platform = se::MultiPlatformManager::PlatformWithName("CUDA");
+  auto platform =
+      se::MultiPlatformManager::PlatformWithName(tensorflow::GpuPlatformName());
  if (!platform.ok()) {
    // Treat failures as non-fatal; there might not be a GPU in the machine.
    VLOG(1) << "Failed to create XLA_GPU device: " << platform.status();
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@ -240,7 +240,10 @@ tf_xla_py_test(
    size = "medium",
    srcs = ["cholesky_op_test.py"],
    python_version = "PY3",
-    tags = ["optonly"],
+    tags = [
+        "no_rocm",
+        "optonly",
+    ],
    deps = [
        ":xla_test",
        "//tensorflow/python:array_ops",
@ -297,7 +300,10 @@ tf_xla_py_test(
        "cpu_ondemand",
    ],
    python_version = "PY3",
-    tags = ["optonly"],
+    tags = [
+        "no_rocm",
+        "optonly",
+    ],
    deps = [
        ":xla_test",
        "//tensorflow/python:array_ops",
@ -382,7 +388,10 @@ tf_xla_py_test(
    size = "medium",
    srcs = ["concat_ops_test.py"],
    python_version = "PY3",
-    tags = ["many_xla_args"],
+    tags = [
+        "many_xla_args",
+        "no_rocm",
+    ],
    deps = [
        ":xla_test",
        "//tensorflow/python:array_ops",
@ -568,7 +577,10 @@ tf_xla_py_test(
    srcs = ["fft_test.py"],
    python_version = "PY3",
    shard_count = 6,
-    tags = ["optonly"],
+    tags = [
+        "no_rocm",
+        "optonly",
+    ],
    deps = [
        ":xla_test",
        "//tensorflow/python:array_ops",
@ -845,7 +857,10 @@ tf_xla_py_test(
    srcs = ["unstack_test.py"],
    python_version = "PY3",
    shard_count = 5,
-    tags = ["optonly"],
+    tags = [
+        "no_rocm",
+        "optonly",
+    ],
    deps = [
        ":xla_test",
        "//tensorflow/python:array_ops",
@ -1292,6 +1307,7 @@ cuda_py_test(
    size = "medium",
    srcs = ["jit_test.py"],
    shard_count = 5,
+    tags = ["no_rocm"],
    xla_enable_strict_auto_jit = False,
    deps = [
        ":test_utils",
@ -1312,6 +1328,7 @@ cuda_py_test(
    name = "dense_layer_test",
    size = "medium",
    srcs = ["dense_layer_test.py"],
+    tags = ["no_rocm"],
    xla_enable_strict_auto_jit = False,
    deps = [
        ":test_utils",
@ -1396,6 +1413,7 @@ py_library(
 cuda_py_test(
    name = "lstm_test",
    srcs = ["lstm_test.py"],
+    tags = ["no_rocm"],
    xla_enable_strict_auto_jit = False,
    deps = [
        ":lstm",
@ -1498,6 +1516,7 @@ tf_xla_py_test(
    srcs = ["conv_node_name_test.py"],
    python_version = "PY3",
    shard_count = 5,
+    tags = ["no_rocm"],
    deps = [
        ":xla_test",
        "//tensorflow/python:array_ops",
--- a/tensorflow/compiler/tests/build_defs.bzl
+++ b/tensorflow/compiler/tests/build_defs.bzl
@ -1,6 +1,7 @@
 """Build rules for Tensorflow/XLA testing."""

 load("@local_config_cuda//cuda:build_defs.bzl", "cuda_is_configured")
+load("@local_config_rocm//rocm:build_defs.bzl", "rocm_is_configured")
 load("//tensorflow/compiler/tests:plugin.bzl", "plugins")
 load(
    "//tensorflow/core/platform:build_config_root.bzl",
@ -10,7 +11,7 @@ load(

 def all_backends():
    b = ["cpu"] + plugins.keys()
-    if cuda_is_configured():
+    if cuda_is_configured() or rocm_is_configured():
        return b + ["gpu"]
    else:
        return b
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc
@ -46,14 +46,20 @@ GpuCodegenTest::CreateNewVerifiedModuleWithFTZ(bool ftz) {
      ShapeUtil::ByteSizeOfElements);
 }

-void GpuCodegenTest::CompileAndVerifyPtx(
+void GpuCodegenTest::CompileAndOptionallyVerifyPtx(
    std::unique_ptr<VerifiedHloModule> hlo_module, absl::string_view pattern) {
  std::unique_ptr<Executable> executable =
      std::move(CompileToExecutable(std::move(hlo_module)).ValueOrDie());
  string ptx_str(static_cast<GpuExecutable*>(executable.get())->text());
-  StatusOr<bool> filecheck_result = RunFileCheck(ptx_str, pattern);
-  ASSERT_TRUE(filecheck_result.ok());
-  EXPECT_TRUE(filecheck_result.ValueOrDie());
+
+  // On the ROCM platform the "ptx" string is not populated for the compiled
+  // executable, and hence the "ptx_str" will be empty. So disabling the
+  // pattern check on the ROCm platform
+  if (!is_built_with_rocm_) {
+    StatusOr<bool> filecheck_result = RunFileCheck(ptx_str, pattern);
+    ASSERT_TRUE(filecheck_result.ok());
+    EXPECT_TRUE(filecheck_result.ValueOrDie());
+  }
 }

 }  // namespace gpu
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h
@ -27,6 +27,11 @@ namespace gpu {

 // Tests that verify IR or PTX emitted by the GPU backend is as expected.
 class GpuCodegenTest : public LlvmIrGenTestBase {
+ public:
+  GpuCodegenTest()
+      : is_built_with_rocm_(
+            se::MultiPlatformManager::PlatformWithName("ROCM").ok()) {}
+
 protected:
  // Like HloTestBase::CreateNewVerifiedModule(), with a flag for configuring
  // the ftz option.
@ -34,8 +39,13 @@ class GpuCodegenTest : public LlvmIrGenTestBase {

  // Compiles the given HLO module to PTX and verifies the PTX matches the given
  // FileCheck pattern.  (See http://llvm.org/docs/CommandGuide/FileCheck.html).
-  void CompileAndVerifyPtx(std::unique_ptr<VerifiedHloModule> hlo_module,
-                           absl::string_view pattern);
+  // The "VerifyPtx" part only happens on the CUDA platform,
+  // and hence the "Optionally" in function name.
+  // For ROCm platform this routine will only do the "Compile" part.
+  void CompileAndOptionallyVerifyPtx(
+      std::unique_ptr<VerifiedHloModule> hlo_module, absl::string_view pattern);
+
+  bool is_built_with_rocm_;
 };

 }  // namespace gpu
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
@ -77,14 +77,14 @@ class GpuFtzDisabledTest : public GpuFtzTest {

 // Check that we emit mul.ftz.f32 when in ftz mode, and plain mul.f32 otherwise.
 TEST_F(GpuFtzEnabledTest, MultiplyFtz) {
-  CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"(
+  CompileAndOptionallyVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"(
    CHECK-NOT: mul.rn.f32
    CHECK: mul.rn.ftz.f32
    CHECK-NOT: mul.rn.f32
  )");
 }
 TEST_F(GpuFtzDisabledTest, MultiplyFtz) {
-  CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"(
+  CompileAndOptionallyVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"(
    CHECK-NOT: mul.rn.ftz.f32
    CHECK: mul.rn.f32
    CHECK-NOT: mul.rn.ftz.f32
@ -97,7 +97,7 @@ TEST_F(GpuFtzDisabledTest, MultiplyFtz) {
 // when ftz is off, we get one call to the ftz version and one call to the
 // regular version.
 TEST_F(GpuFtzEnabledTest, ExpFtz) {
-  CompileAndVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"(
+  CompileAndOptionallyVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"(
    CHECK-NOT: ex2.approx.f32
    CHECK:     ex2.approx.ftz.f32
    CHECK-NOT: ex2.approx.f32
@ -108,7 +108,7 @@ TEST_F(GpuFtzEnabledTest, ExpFtz) {
 }

 TEST_F(GpuFtzDisabledTest, ExpFtz) {
-  CompileAndVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"(
+  CompileAndOptionallyVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"(
    CHECK-NOT: ex2.approx.f32
    CHECK-DAG: ex2.approx.ftz.f32
    CHECK-DAG: ex2.approx.f32
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
@ -105,12 +105,17 @@ TEST_F(GpuIndexTest, CompatibleUseLinearIndexWithReshapeAndBroadcast) {
                    .ValueOrDie();

  // Check the optimized IR reuses the linear index by calculating modulo 14.
+
+  // In the IR generated for AMDGPUs, we do not seem to have the
+  // the addrspace(1) attribute for the lines being checked by the following
+  // patterns.
+  // need to investigate why that is the case, and whether or not it is ok
  CompileAndVerifyIr(std::move(module),
                     R"(
 ; CHECK: %[[urem1:.*]] = urem i{{[0-9]*}} %[[linear_index:.*]], 14
-; CHECK: %[[bitcast:.*]] = bitcast i8 addrspace(1)* %[[alloc:.*]] to float addrspace(1)*
+; CHECK: %[[bitcast:.*]] = bitcast i8{{( addrspace\(1\))?}}* %[[alloc:.*]] to float{{( addrspace\(1\))?}}*
 ; CHECK: %[[idx1:.*]] = zext i{{[0-9]*}} %[[urem1]] to i64
-; CHECK: getelementptr inbounds float, float addrspace(1)* %[[bitcast]], i64 %[[idx1]]
+; CHECK: getelementptr inbounds float, float{{( addrspace\(1\))?}}* %[[bitcast]], i64 %[[idx1]]
      )",
                     /*match_optimized_ir=*/true);
 }
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_input_fusible_slice_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_input_fusible_slice_test.cc
@ -63,12 +63,17 @@ TEST_F(GpuSliceInputFusionTest, InputFusionWithOnlyOneSlice) {
  auto hlo_module =
      ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
          .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @fusion
+; CHECK: slice0
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @fusion
 ; CHECK: slice0
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                     /*match_optimized_ir=*/false);
  // Check that the kernel runs correctly.
  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0, 0}));
@ -100,12 +105,17 @@ TEST_F(GpuSliceInputFusionTest, InputFusionWithATupleOfSlices) {
  auto hlo_module =
      ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
          .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @fusion
+; CHECK: slice2
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @fusion
 ; CHECK: slice2
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                     /*match_optimized_ir=*/false);
  // Check that the kernel runs correctly.
  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0, 0}));
@ -142,12 +152,17 @@ TEST_F(GpuSliceInputFusionTest, ConcatThenSplit) {
  auto hlo_module =
      ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
          .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @fusion
+; CHECK: slice2
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @fusion
 ; CHECK: slice2
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                     /*match_optimized_ir=*/false);
  // Check that the kernel runs correctly.
  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0, 0}));
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
@ -63,12 +63,19 @@ TEST_F(GpuKernelTilingTest, UnnestedTransposeWithProperDimensionsTiled) {
  auto hlo_module =
      ParseAndReturnVerifiedModule(kHloString, ConfigWithLayoutAssignment())
          .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @copy
+; CHECK: call void @llvm.amdgcn.s.barrier()
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @copy
 ; CHECK: call void @llvm.nvvm.barrier0()
 ; CHECK: }
-)",
+)";
+
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                     /*match_optimized_ir=*/true);

  // Check that the kernel runs correctly.
@ -90,12 +97,17 @@ TEST_F(GpuKernelTilingTest, UnnestedTransposeWithSmallDimensionsNotTiled) {
  auto hlo_module =
      ParseAndReturnVerifiedModule(kHloString, ConfigWithLayoutAssignment())
          .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @copy
+; CHECK-NOT: call void @llvm.amdgcn.s.barrier()
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @copy
 ; CHECK-NOT: call void @llvm.nvvm.barrier0()
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                     /*match_optimized_ir=*/true);
 }

@ -134,12 +146,17 @@ TEST_F(GpuKernelTilingTest, SimpleFusionWithTransposeTiled) {
  auto hlo_module =
      ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
          .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @fusion
+; CHECK: call void @llvm.amdgcn.s.barrier()
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @fusion
 ; CHECK: call void @llvm.nvvm.barrier0()
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                     /*match_optimized_ir=*/true);

  // Check that the kernel runs correctly.
@ -169,12 +186,17 @@ TEST_F(GpuKernelTilingTest, MultipleOutputFusionWithOnePossibleTransposeTiled) {
  auto hlo_module =
      ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
          .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @fusion
+; CHECK: call void @llvm.amdgcn.s.barrier()
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @fusion
 ; CHECK: call void @llvm.nvvm.barrier0()
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                     /*match_optimized_ir=*/true);

  // Check that the kernel runs correctly.
@ -205,12 +227,17 @@ TEST_F(GpuKernelTilingTest,
  auto hlo_module =
      ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
          .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @fusion
+; CHECK-NOT: call void @llvm.amdgcn.s.barrier()
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @fusion
 ; CHECK-NOT: call void @llvm.nvvm.barrier0()
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                     /*match_optimized_ir=*/true);
 }

@ -233,12 +260,17 @@ TEST_F(GpuKernelTilingTest, TransposedInputWithUserReverseNotTiled) {
  auto hlo_module =
      ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
          .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @fusion
+; CHECK-NOT: call void @llvm.amdgcn.s.barrier()
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @fusion
 ; CHECK-NOT: call void @llvm.nvvm.barrier0()
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                     /*match_optimized_ir=*/true);
 }

@ -261,12 +293,17 @@ TEST_F(GpuKernelTilingTest, TransposedInputWithUserBitcastNotTiled) {
  auto hlo_module =
      ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
          .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @fusion
+; CHECK-NOT: call void @llvm.amdgcn.s.barrier()
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @fusion
 ; CHECK-NOT: call void @llvm.nvvm.barrier0()
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                     /*match_optimized_ir=*/true);

  // Check that the kernel runs correctly.
@ -297,12 +334,17 @@ TEST_F(GpuKernelTilingTest, TransposedInputWithoutUnsafeUseTiled) {
  auto hlo_module =
      ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
          .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @fusion
+; CHECK: call void @llvm.amdgcn.s.barrier()
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @fusion
 ; CHECK: call void @llvm.nvvm.barrier0()
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                     /*match_optimized_ir=*/true);
  // Check that the kernel runs correctly.
  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0.0}));
@ -329,14 +371,31 @@ TEST_F(GpuKernelTilingTest, ColumnReductionWithPowerOf2OutputElementsUnrolled) {
  auto hlo_module =
      ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
          .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @fusion
+;
+; CHECK-LABEL: atomic_op_loop_body{{.*}}:
+; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}}
+; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32
+; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]]
+;
+; CHECK-LABEL: atomic_op_loop_body{{.*}}:
+; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}}
+; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32
+; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]]
+;
+; CHECK-NOT: cmpxchg
+;
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @fusion
 ; CHECK: atomicrmw fadd float
 ; CHECK: atomicrmw fadd float
 ; CHECK-NOT: atomicrmw fadd float
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                     /*match_optimized_ir=*/true);
  // Check that the kernel runs correctly.
  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5}));
@ -376,13 +435,25 @@ TEST_F(GpuKernelTilingTest,
  auto hlo_module =
      ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
          .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @fusion
+;
+; CHECK-LABEL: atomic_op_loop_body{{.*}}:
+; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}}
+; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32
+; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]]
+;
+; CHECK-NOT: cmpxchg
+;
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @fusion
 ; CHECK: atomicrmw fadd float
 ; CHECK-NOT: atomicrmw fadd float
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                     /*match_optimized_ir=*/true);
  // Check that the kernel runs correctly.
  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5}));
@ -424,8 +495,34 @@ TEST_F(GpuKernelTilingTest, ColumnReductionMOFUnrolled) {
  auto hlo_module =
      ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
          .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @fusion
+;
+; CHECK-LABEL: atomic_op_loop_body{{.*}}:
+; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}}
+; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32
+; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]]
+;
+; CHECK-LABEL: atomic_op_loop_body{{.*}}:
+; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}}
+; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32
+; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]]
+;
+; CHECK-LABEL: atomic_op_loop_body{{.*}}:
+; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}}
+; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32
+; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]]
+;
+; CHECK-LABEL: atomic_op_loop_body{{.*}}:
+; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}}
+; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32
+; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]]
+;
+; CHECK-NOT: cmpxchg
+;
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @fusion
 ; CHECK: atomicrmw fadd float
 ; CHECK: atomicrmw fadd float
@ -433,7 +530,8 @@ TEST_F(GpuKernelTilingTest, ColumnReductionMOFUnrolled) {
 ; CHECK: atomicrmw fadd float
 ; CHECK-NOT: atomicrmw fadd float
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                     /*match_optimized_ir=*/true);
  // Check that the kernel runs correctly.
  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5}));
@ -459,12 +557,20 @@ TEST_F(GpuKernelTilingTest, ColumnReductionWithLayoutChangeTiled) {
  auto hlo_module =
      ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
          .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @
+; CHECK-LABEL: atomic_op_loop_body{{.*}}:
+; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}}
+; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32
+; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]]
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @
 ; CHECK: atomicrmw fadd float
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                     /*match_optimized_ir=*/true);

  // Check that the kernel runs correctly.
@ -491,12 +597,17 @@ TEST_F(GpuKernelTilingTest, RowReductionWithLayoutChangeTiled) {
  auto hlo_module =
      ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
          .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @reduce
+; CHECK: call i32 @llvm.amdgcn.ds.bpermute
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @reduce
 ; CHECK: call float @llvm.nvvm.shfl.sync.down.f32
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                     /*match_optimized_ir=*/true);

  // Check that the kernel runs correctly.
@ -524,12 +635,20 @@ TEST_F(GpuKernelTilingTest,
  auto hlo_module =
      ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
          .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @reduce
+; CHECK-LABEL: atomic_op_loop_body{{.*}}:
+; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}}
+; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32
+; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]]
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @reduce
 ; CHECK: atomicrmw fadd float
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                     /*match_optimized_ir=*/true);

  // Check that the kernel runs correctly.
@ -570,12 +689,17 @@ TEST_F(GpuKernelTilingTest, ColumnReductionSmallTileSizeX) {
  auto hlo_module =
      ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
          .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @fusion
+; CHECK-NOT: reduce.0.loop_header
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @fusion
 ; CHECK-NOT: reduce.0.loop_header
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                     /*match_optimized_ir=*/true);
  // Check that the kernel runs correctly.
  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5}));
@ -601,12 +725,17 @@ TEST_F(GpuKernelTilingTest, RowReductionWithSmallDimensionNotTiled) {
  auto hlo_module =
      ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
          .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @reduce
+; CHECK-NOT: call i32 @llvm.amdgcn.ds.bpermute
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @reduce
 ; CHECK-NOT: call float @llvm.nvvm.shfl.sync.down.f32
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                     /*match_optimized_ir=*/true);

  // Check that the kernel runs correctly.
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
@ -38,6 +38,11 @@ class GpuLdgTest : public GpuCodegenTest {};

 // Parameters are never overwritten, so parameter reads should get ld.global.nc
 // reads.
+//
+// On the ROCM platform the "ptx" string is not populated for the compiled
+// executable, and hence the call to CompileAdnVerifyPtx does not do the
+// "VerifyPtx" part, it merely compiles the executable
+//
 TEST_F(GpuLdgTest, LdgForParamRead) {
  HloComputation::Builder builder(TestName());

@ -51,7 +56,7 @@ TEST_F(GpuLdgTest, LdgForParamRead) {
  auto hlo_module = CreateNewVerifiedModule();
  hlo_module->AddEntryComputation(std::move(computation));

-  CompileAndVerifyPtx(std::move(hlo_module), R"(
+  CompileAndOptionallyVerifyPtx(std::move(hlo_module), R"(
    CHECK-NOT: ld.global.f32
    CHECK: ld.global.nc.f32
  )");
@ -60,6 +65,11 @@ TEST_F(GpuLdgTest, LdgForParamRead) {
 // Check that reading a buffer produced by a non-parameter HLO also results in
 // ld.global.nc, if that buffer isn't modified within the instruction that reads
 // it.
+//
+// On the ROCM platform the "ptx" string is not populated for the compiled
+// executable, and hence the call to CompileAdnVerifyPtx does not do the
+// "VerifyPtx" part, it merely compiles the executable
+//
 TEST_F(GpuLdgTest, LdgForNonParamRead) {
  HloComputation::Builder builder(TestName());

@ -76,7 +86,7 @@ TEST_F(GpuLdgTest, LdgForNonParamRead) {
  auto hlo_module = CreateNewVerifiedModule();
  hlo_module->AddEntryComputation(std::move(computation));

-  CompileAndVerifyPtx(std::move(hlo_module), R"(
+  CompileAndOptionallyVerifyPtx(std::move(hlo_module), R"(
    CHECK: {
    CHECK-NOT: ld.global.f32
    CHECK: ld.global.nc.f32
@ -94,6 +104,11 @@ TEST_F(GpuLdgTest, LdgForNonParamRead) {
 // It seems like a fair bet that we won't start fusing sin into the output of
 // reduce in the foreseeable future.  But if that turns out to be wrong, I give
 // you, future reader, permission to delete this test.
+//
+// On the ROCM platform the "ptx" string is not populated for the compiled
+// executable, and hence the call to CompileAdnVerifyPtx does not do the
+// "VerifyPtx" part, it merely compiles the executable
+//
 TEST_F(GpuLdgTest, NoLdgWhenSharingBuffer) {
  auto hlo_module = CreateNewVerifiedModule();
  HloComputation::Builder builder(TestName());
@ -128,7 +143,7 @@ TEST_F(GpuLdgTest, NoLdgWhenSharingBuffer) {
  std::unique_ptr<HloComputation> computation = builder.Build();
  hlo_module->AddEntryComputation(std::move(computation));

-  CompileAndVerifyPtx(std::move(hlo_module), R"(
+  CompileAndOptionallyVerifyPtx(std::move(hlo_module), R"(
    CHECK-LABEL: .entry sin
    CHECK: {
    CHECK-NOT: ld.global.nc.f32
--- a/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD
+++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD
@ -21,7 +21,7 @@ package_group(
 tf_cc_test(
    name = "mlir_gpu_lhlo_gen_test",
    srcs = ["mlir_gpu_lhlo_gen_test.cc"],
-    tags = tf_cuda_tests_tags(),
+    tags = tf_cuda_tests_tags() + ["no_rocm"],
    deps = [
        "//tensorflow/compiler/xla/service:mlir_gpu_plugin",
        "//tensorflow/compiler/xla/service/mlir_gpu:mlir_irgen_test_base",
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@ -607,6 +607,7 @@ xla_test(
    name = "conditional_test",
    srcs = ["conditional_test.cc"],
    shard_count = 2,
+    tags = ["no_rocm"],
    deps = [
        ":test_macros_header",
        "//tensorflow/compiler/xla:xla_data_proto_cc",
@ -645,6 +646,7 @@ xla_test(
    name = "scalar_computations_test",
    srcs = ["scalar_computations_test.cc"],
    shard_count = 32,
+    tags = ["no_rocm"],
    deps = [
        ":test_macros_header",
        "//tensorflow/compiler/xla:literal",
@ -944,6 +946,7 @@ xla_test(
    srcs = ["dot_operation_test.cc"],
    shard_count = 20,
    tags = [
+        "no_rocm",
        "optonly",
    ],
    deps = [
@ -977,6 +980,7 @@ xla_test(
    backends = ["gpu"],
    shard_count = 20,
    tags = [
+        "no_rocm",
        "optonly",
    ],
    deps = [
@ -1039,7 +1043,10 @@ xla_test(
        ],
    },
    shard_count = 20,
-    tags = ["optonly"],
+    tags = [
+        "no_rocm",
+        "optonly",
+    ],
    deps = [
        ":test_macros_header",
        "//tensorflow/compiler/xla:array2d",
@ -1133,7 +1140,10 @@ xla_test(
    timeout = "long",
    srcs = ["convolution_test.cc"],
    shard_count = 40,
-    tags = ["optonly"],
+    tags = [
+        "no_rocm",
+        "optonly",
+    ],
    deps = CONVOLUTION_TEST_DEPS + [
        "@com_google_absl//absl/memory",
        "@com_google_absl//absl/strings",
@ -1150,7 +1160,10 @@ xla_test(
    args = ["--xla_gpu_disable_autotune"],
    backends = ["gpu"],
    shard_count = 40,
-    tags = ["optonly"],
+    tags = [
+        "no_rocm",
+        "optonly",
+    ],
    deps = CONVOLUTION_TEST_DEPS + [
        "@com_google_absl//absl/memory",
        "@com_google_absl//absl/strings",
@ -1164,6 +1177,7 @@ xla_test(
    backend_args = {"gpu": ["--xla_backend_extra_options=xla_gpu_experimental_conv_disable_layout_heuristic"]},
    backends = ["gpu"],
    shard_count = 25,
+    tags = ["no_rocm"],
    deps = CONVOLUTION_TEST_DEPS + [
        "@com_google_absl//absl/memory",
        "@com_google_absl//absl/strings",
@ -1233,6 +1247,7 @@ xla_test(
        "interpreter",
    ],
    shard_count = 40,
+    tags = ["no_rocm"],
    deps = [
        ":client_library_test_base",
        ":hlo_test_base",
@ -1438,6 +1453,7 @@ xla_test(
    srcs = ["reduce_test.cc"],
    shard_count = 31,
    tags = [
+        "no_rocm",
        "optonly",
    ],
    deps = [
@ -1517,6 +1533,7 @@ xla_test(
    timeout = "long",
    srcs = ["select_and_scatter_test.cc"],
    tags = [
+        "no_rocm",
        "optonly",
    ],
    deps = [
@ -2563,7 +2580,10 @@ xla_test(
 xla_test(
    name = "cholesky_test",
    srcs = ["cholesky_test.cc"],
-    tags = ["optonly"],
+    tags = [
+        "no_rocm",
+        "optonly",
+    ],
    deps = [
        ":test_macros_header",
        "//tensorflow/compiler/xla:array2d",