From c14b6951de82cd4c4957ccb181ef2946a8309ff1 Mon Sep 17 00:00:00 2001
From: Deven Desai
Date: Fri, 3 Jan 2020 15:33:23 +0000
Subject: [PATCH] [ROCm] adding ROCm specific versions of "expected" results for FileCheck

---
 .../xla/service/gpu/tests/gpu_codegen_test.cc |   6 +
 .../xla/service/gpu/tests/gpu_codegen_test.h  |   7 +
 .../xla/service/gpu/tests/gpu_ftz_test.cc     |  20 ++
 .../xla/service/gpu/tests/gpu_index_test.cc   |  17 +-
 .../gpu/tests/gpu_input_fusible_slice_test.cc |  33 ++-
 .../gpu/tests/gpu_kernel_tiling_test.cc       | 225 ++++++++++++++----
 .../xla/service/gpu/tests/gpu_ldg_test.cc     |  15 ++
 7 files changed, 263 insertions(+), 60 deletions(-)

diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc
index 36ff644fb2d..ce62fe205ab 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc
@@ -51,9 +51,15 @@ void GpuCodegenTest::CompileAndVerifyPtx(
   std::unique_ptr<Executable> executable =
       std::move(CompileToExecutable(std::move(hlo_module)).ValueOrDie());
   string ptx_str(static_cast<GpuExecutable*>(executable.get())->text());
+
+  // On the ROCM platform the "ptx" string is not populated for the compiled
+  // executable, and hence "ptx_str" will be empty. So the pattern check is
+  // disabled on the ROCm platform.
+#if !defined(TENSORFLOW_USE_ROCM)
   StatusOr<bool> filecheck_result = RunFileCheck(ptx_str, pattern);
   ASSERT_TRUE(filecheck_result.ok());
   EXPECT_TRUE(filecheck_result.ValueOrDie());
+#endif
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h
index 83cce1ccd3c..5f5b21150c1 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h
@@ -27,6 +27,11 @@ namespace gpu {
 // Tests that verify IR or PTX emitted by the GPU backend is as expected.
 class GpuCodegenTest : public LlvmIrGenTestBase {
+ public:
+  GpuCodegenTest()
+      : is_built_with_rocm_(
+            se::MultiPlatformManager::PlatformWithName("ROCM").ok()) {}
+
  protected:
   // Like HloTestBase::CreateNewVerifiedModule(), with a flag for configuring
   // the ftz option.
@@ -36,6 +41,8 @@ class GpuCodegenTest : public LlvmIrGenTestBase {
   // FileCheck pattern. (See http://llvm.org/docs/CommandGuide/FileCheck.html).
   void CompileAndVerifyPtx(std::unique_ptr<VerifiedHloModule> hlo_module,
                            absl::string_view pattern);
+
+  bool is_built_with_rocm_;
 };
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
index e2a2d127eff..1e95119d7ae 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
@@ -76,6 +76,11 @@ class GpuFtzDisabledTest : public GpuFtzTest {
 };
 
 // Check that we emit mul.ftz.f32 when in ftz mode, and plain mul.f32 otherwise.
+//
+// On the ROCM platform the "ptx" string is not populated for the compiled
+// executable, and hence the call to CompileAndVerifyPtx does not do the
+// "VerifyPtx" part; it merely compiles the executable.
+//
 TEST_F(GpuFtzEnabledTest, MultiplyFtz) {
   CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"(
     CHECK-NOT: mul.rn.f32
@@ -83,6 +88,11 @@ TEST_F(GpuFtzEnabledTest, MultiplyFtz) {
     CHECK-NOT: mul.rn.f32
   )");
 }
+//
+// On the ROCM platform the "ptx" string is not populated for the compiled
+// executable, and hence the call to CompileAndVerifyPtx does not do the
+// "VerifyPtx" part; it merely compiles the executable.
+//
 TEST_F(GpuFtzDisabledTest, MultiplyFtz) {
   CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"(
     CHECK-NOT: mul.rn.ftz.f32
@@ -96,6 +106,11 @@ TEST_F(GpuFtzDisabledTest, MultiplyFtz) {
 // calls to ex2.approx. When ftz is on, we get two calls to the ftz version;
 // when ftz is off, we get one call to the ftz version and one call to the
 // regular version.
+//
+// On the ROCM platform the "ptx" string is not populated for the compiled
+// executable, and hence the call to CompileAndVerifyPtx does not do the
+// "VerifyPtx" part; it merely compiles the executable.
+//
 TEST_F(GpuFtzEnabledTest, ExpFtz) {
   CompileAndVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"(
     CHECK-NOT: ex2.approx.f32
@@ -107,6 +122,11 @@ TEST_F(GpuFtzEnabledTest, ExpFtz) {
   )");
 }
 
+//
+// On the ROCM platform the "ptx" string is not populated for the compiled
+// executable, and hence the call to CompileAndVerifyPtx does not do the
+// "VerifyPtx" part; it merely compiles the executable.
+//
 TEST_F(GpuFtzDisabledTest, ExpFtz) {
   CompileAndVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"(
     CHECK-NOT: ex2.approx.f32
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
index 177e43309c3..3dd250c1d1d 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
@@ -105,13 +105,24 @@ TEST_F(GpuIndexTest, CompatibleUseLinearIndexWithReshapeAndBroadcast) {
       .ValueOrDie();
   // Check the optimized IR reuses the linear index by calculating modulo 14.
-  CompileAndVerifyIr(std::move(module),
-                     R"(
+
+  // In the IR generated for AMDGPUs, the addrspace(1) attribute does not
+  // seem to be present on the lines checked by the following patterns.
+  // We still need to investigate why that is the case, and whether or not
+  // it is OK.
+  auto expected_ir = is_built_with_rocm_ ? 
R"( +; CHECK: %[[urem1:.*]] = urem i{{[0-9]*}} %[[linear_index:.*]], 14 +; CHECK: %[[bitcast:.*]] = bitcast i8* %[[alloc:.*]] to float* +; CHECK: %[[idx1:.*]] = zext i{{[0-9]*}} %[[urem1]] to i64 +; CHECK: getelementptr inbounds float, float* %[[bitcast]], i64 %[[idx1]] + )" + : R"( ; CHECK: %[[urem1:.*]] = urem i{{[0-9]*}} %[[linear_index:.*]], 14 ; CHECK: %[[bitcast:.*]] = bitcast i8 addrspace(1)* %[[alloc:.*]] to float addrspace(1)* ; CHECK: %[[idx1:.*]] = zext i{{[0-9]*}} %[[urem1]] to i64 ; CHECK: getelementptr inbounds float, float addrspace(1)* %[[bitcast]], i64 %[[idx1]] - )", + )"; + CompileAndVerifyIr(std::move(module), expected_ir, /*match_optimized_ir=*/true); } diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_input_fusible_slice_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_input_fusible_slice_test.cc index 7f345c19331..369060897df 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_input_fusible_slice_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_input_fusible_slice_test.cc @@ -63,12 +63,17 @@ TEST_F(GpuSliceInputFusionTest, InputFusionWithOnlyOneSlice) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @fusion +; CHECK: slice0 +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @fusion ; CHECK: slice0 ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/false); // Check that the kernel runs correctly. EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0, 0})); @@ -100,12 +105,17 @@ TEST_F(GpuSliceInputFusionTest, InputFusionWithATupleOfSlices) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @fusion +; CHECK: slice2 +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @fusion ; CHECK: slice2 ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/false); // Check that the kernel runs correctly. EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0, 0})); @@ -142,12 +152,17 @@ TEST_F(GpuSliceInputFusionTest, ConcatThenSplit) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @fusion +; CHECK: slice2 +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @fusion ; CHECK: slice2 ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/false); // Check that the kernel runs correctly. 
EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0, 0})); diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc index ae10fb161d6..a12df5f1010 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc @@ -63,12 +63,19 @@ TEST_F(GpuKernelTilingTest, UnnestedTransposeWithProperDimensionsTiled) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @copy +; CHECK: call void @llvm.amdgcn.s.barrier() +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @copy ; CHECK: call void @llvm.nvvm.barrier0() ; CHECK: } -)", +)"; + + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); // Check that the kernel runs correctly. @@ -90,12 +97,17 @@ TEST_F(GpuKernelTilingTest, UnnestedTransposeWithSmallDimensionsNotTiled) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @copy +; CHECK-NOT: call void @llvm.amdgcn.s.barrier() +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @copy ; CHECK-NOT: call void @llvm.nvvm.barrier0() ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); } @@ -134,12 +146,17 @@ TEST_F(GpuKernelTilingTest, SimpleFusionWithTransposeTiled) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @fusion +; CHECK: call void @llvm.amdgcn.s.barrier() +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @fusion ; CHECK: call void @llvm.nvvm.barrier0() ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); // Check that the kernel runs correctly. @@ -169,12 +186,17 @@ TEST_F(GpuKernelTilingTest, MultipleOutputFusionWithOnePossibleTransposeTiled) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @fusion +; CHECK: call void @llvm.amdgcn.s.barrier() +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @fusion ; CHECK: call void @llvm.nvvm.barrier0() ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); // Check that the kernel runs correctly. @@ -205,12 +227,17 @@ TEST_F(GpuKernelTilingTest, auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? 
R"( +; CHECK-LABEL: define amdgpu_kernel void @fusion +; CHECK-NOT: call void @llvm.amdgcn.s.barrier() +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @fusion ; CHECK-NOT: call void @llvm.nvvm.barrier0() ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); } @@ -233,12 +260,17 @@ TEST_F(GpuKernelTilingTest, TransposedInputWithUserReverseNotTiled) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @fusion +; CHECK-NOT: call void @llvm.amdgcn.s.barrier() +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @fusion ; CHECK-NOT: call void @llvm.nvvm.barrier0() ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); } @@ -261,12 +293,17 @@ TEST_F(GpuKernelTilingTest, TransposedInputWithUserBitcastNotTiled) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @fusion +; CHECK-NOT: call void @llvm.amdgcn.s.barrier() +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @fusion ; CHECK-NOT: call void @llvm.nvvm.barrier0() ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); // Check that the kernel runs correctly. @@ -297,12 +334,17 @@ TEST_F(GpuKernelTilingTest, TransposedInputWithoutUnsafeUseTiled) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @fusion +; CHECK: call void @llvm.amdgcn.s.barrier() +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @fusion ; CHECK: call void @llvm.nvvm.barrier0() ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); // Check that the kernel runs correctly. EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0.0})); @@ -329,14 +371,31 @@ TEST_F(GpuKernelTilingTest, ColumnReductionWithPowerOf2OutputElementsUnrolled) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @fusion +; +; CHECK-LABEL: atomic_op_loop_body{{.*}}: +; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}} +; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32 +; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]] +; +; CHECK-LABEL: atomic_op_loop_body{{.*}}: +; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}} +; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32 +; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]] +; +; CHECK-NOT: cmpxchg +; +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @fusion ; CHECK: atomicrmw fadd float ; CHECK: atomicrmw fadd float ; CHECK-NOT: atomicrmw fadd float ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); // Check that the kernel runs correctly. 
EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5})); @@ -376,13 +435,25 @@ TEST_F(GpuKernelTilingTest, auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @fusion +; +; CHECK-LABEL: atomic_op_loop_body{{.*}}: +; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}} +; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32 +; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]] +; +; CHECK-NOT: cmpxchg +; +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @fusion ; CHECK: atomicrmw fadd float ; CHECK-NOT: atomicrmw fadd float ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); // Check that the kernel runs correctly. EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5})); @@ -424,8 +495,34 @@ TEST_F(GpuKernelTilingTest, ColumnReductionMOFUnrolled) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @fusion +; +; CHECK-LABEL: atomic_op_loop_body{{.*}}: +; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}} +; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32 +; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]] +; +; CHECK-LABEL: atomic_op_loop_body{{.*}}: +; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}} +; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32 +; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]] +; +; CHECK-LABEL: atomic_op_loop_body{{.*}}: +; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}} +; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32 +; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]] +; +; CHECK-LABEL: atomic_op_loop_body{{.*}}: +; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}} +; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32 +; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]] +; +; CHECK-NOT: cmpxchg +; +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @fusion ; CHECK: atomicrmw fadd float ; CHECK: atomicrmw fadd float @@ -433,7 +530,8 @@ TEST_F(GpuKernelTilingTest, ColumnReductionMOFUnrolled) { ; CHECK: atomicrmw fadd float ; CHECK-NOT: atomicrmw fadd float ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); // Check that the kernel runs correctly. EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5})); @@ -459,12 +557,20 @@ TEST_F(GpuKernelTilingTest, ColumnReductionWithLayoutChangeTiled) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? 
R"( +; CHECK-LABEL: define amdgpu_kernel void @ +; CHECK-LABEL: atomic_op_loop_body{{.*}}: +; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}} +; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32 +; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]] +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @ ; CHECK: atomicrmw fadd float ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); // Check that the kernel runs correctly. @@ -491,12 +597,17 @@ TEST_F(GpuKernelTilingTest, RowReductionWithLayoutChangeTiled) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @reduce +; CHECK: call i32 @llvm.amdgcn.ds.bpermute +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @reduce ; CHECK: call float @llvm.nvvm.shfl.sync.down.f32 ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); // Check that the kernel runs correctly. @@ -524,12 +635,20 @@ TEST_F(GpuKernelTilingTest, auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @reduce +; CHECK-LABEL: atomic_op_loop_body{{.*}}: +; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}} +; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32 +; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]] +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @reduce ; CHECK: atomicrmw fadd float ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); // Check that the kernel runs correctly. @@ -570,12 +689,17 @@ TEST_F(GpuKernelTilingTest, ColumnReductionSmallTileSizeX) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @fusion +; CHECK-NOT: reduce.0.loop_header +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @fusion ; CHECK-NOT: reduce.0.loop_header ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); // Check that the kernel runs correctly. EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5})); @@ -601,12 +725,17 @@ TEST_F(GpuKernelTilingTest, RowReductionWithSmallDimensionNotTiled) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @reduce +; CHECK-NOT: call i32 @llvm.amdgcn.ds.bpermute +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @reduce ; CHECK-NOT: call float @llvm.nvvm.shfl.sync.down.f32 ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); // Check that the kernel runs correctly. 
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
index 8b844e66b90..3b19b50eece 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
@@ -38,6 +38,11 @@ class GpuLdgTest : public GpuCodegenTest {};
 // Parameters are never overwritten, so parameter reads should get ld.global.nc
 // reads.
+//
+// On the ROCM platform the "ptx" string is not populated for the compiled
+// executable, and hence the call to CompileAndVerifyPtx does not do the
+// "VerifyPtx" part; it merely compiles the executable.
+//
 TEST_F(GpuLdgTest, LdgForParamRead) {
   HloComputation::Builder builder(TestName());
 
@@ -60,6 +65,11 @@ TEST_F(GpuLdgTest, LdgForParamRead) {
 // Check that reading a buffer produced by a non-parameter HLO also results in
 // ld.global.nc, if that buffer isn't modified within the instruction that reads
 // it.
+//
+// On the ROCM platform the "ptx" string is not populated for the compiled
+// executable, and hence the call to CompileAndVerifyPtx does not do the
+// "VerifyPtx" part; it merely compiles the executable.
+//
 TEST_F(GpuLdgTest, LdgForNonParamRead) {
   HloComputation::Builder builder(TestName());
 
@@ -94,6 +104,11 @@ TEST_F(GpuLdgTest, LdgForNonParamRead) {
 // It seems like a fair bet that we won't start fusing sin into the output of
 // reduce in the foreseeable future. But if that turns out to be wrong, I give
 // you, future reader, permission to delete this test.
+//
+// On the ROCM platform the "ptx" string is not populated for the compiled
+// executable, and hence the call to CompileAndVerifyPtx does not do the
+// "VerifyPtx" part; it merely compiles the executable.
+//
 TEST_F(GpuLdgTest, NoLdgWhenSharingBuffer) {
   auto hlo_module = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());