From c14b6951de82cd4c4957ccb181ef2946a8309ff1 Mon Sep 17 00:00:00 2001
From: Deven Desai <deven.desai.amd@gmail.com>
Date: Fri, 3 Jan 2020 15:33:23 +0000
Subject: [PATCH 1/4] [ROCm] adding ROCm specific versions of "expected"
 results for FileCheck

---
 .../xla/service/gpu/tests/gpu_codegen_test.cc |   6 +
 .../xla/service/gpu/tests/gpu_codegen_test.h  |   7 +
 .../xla/service/gpu/tests/gpu_ftz_test.cc     |  20 ++
 .../xla/service/gpu/tests/gpu_index_test.cc   |  17 +-
 .../gpu/tests/gpu_input_fusible_slice_test.cc |  33 ++-
 .../gpu/tests/gpu_kernel_tiling_test.cc       | 225 ++++++++++++++----
 .../xla/service/gpu/tests/gpu_ldg_test.cc     |  15 ++
 7 files changed, 263 insertions(+), 60 deletions(-)
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc
index 36ff644fb2d..ce62fe205ab 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc
@@ -51,9 +51,15 @@ void GpuCodegenTest::CompileAndVerifyPtx(
   std::unique_ptr<Executable> executable =
       std::move(CompileToExecutable(std::move(hlo_module)).ValueOrDie());
   string ptx_str(static_cast<GpuExecutable*>(executable.get())->text());
+
+  // On the ROCM platform the "ptx" string is not populated for the compiled
+  // executable, and hence the "ptx_str" will be empty. So disabling the
+  // pattern check on the ROCm platform
+#if !defined(TENSORFLOW_USE_ROCM)
   StatusOr<bool> filecheck_result = RunFileCheck(ptx_str, pattern);
   ASSERT_TRUE(filecheck_result.ok());
   EXPECT_TRUE(filecheck_result.ValueOrDie());
+#endif
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h
index 83cce1ccd3c..5f5b21150c1 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h
@@ -27,6 +27,11 @@ namespace gpu {
 
 // Tests that verify IR or PTX emitted by the GPU backend is as expected.
 class GpuCodegenTest : public LlvmIrGenTestBase {
+ public:
+  GpuCodegenTest()
+      : is_built_with_rocm_(
+            se::MultiPlatformManager::PlatformWithName("ROCM").ok()) {}
+
  protected:
   // Like HloTestBase::CreateNewVerifiedModule(), with a flag for configuring
   // the ftz option.
@@ -36,6 +41,8 @@ class GpuCodegenTest : public LlvmIrGenTestBase {
   // FileCheck pattern.  (See http://llvm.org/docs/CommandGuide/FileCheck.html).
   void CompileAndVerifyPtx(std::unique_ptr<VerifiedHloModule> hlo_module,
                            absl::string_view pattern);
+
+  bool is_built_with_rocm_;
 };
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
index e2a2d127eff..1e95119d7ae 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
@@ -76,6 +76,11 @@ class GpuFtzDisabledTest : public GpuFtzTest {
 };
 
 // Check that we emit mul.ftz.f32 when in ftz mode, and plain mul.f32 otherwise.
+//
+// On the ROCM platform the "ptx" string is not populated for the compiled
+// executable, and hence the call to CompileAdnVerifyPtx does not do the
+// "VerifyPtx" part, it merely compiles the executable
+//
 TEST_F(GpuFtzEnabledTest, MultiplyFtz) {
   CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"(
     CHECK-NOT: mul.rn.f32
@@ -83,6 +88,11 @@ TEST_F(GpuFtzEnabledTest, MultiplyFtz) {
     CHECK-NOT: mul.rn.f32
   )");
 }
+//
+// On the ROCM platform the "ptx" string is not populated for the compiled
+// executable, and hence the call to CompileAdnVerifyPtx does not do the
+// "VerifyPtx" part, it merely compiles the executable
+//
 TEST_F(GpuFtzDisabledTest, MultiplyFtz) {
   CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"(
     CHECK-NOT: mul.rn.ftz.f32
@@ -96,6 +106,11 @@ TEST_F(GpuFtzDisabledTest, MultiplyFtz) {
 // calls to ex2.approx.  When ftz is on, we get two calls to the ftz version;
 // when ftz is off, we get one call to the ftz version and one call to the
 // regular version.
+//
+// On the ROCM platform the "ptx" string is not populated for the compiled
+// executable, and hence the call to CompileAdnVerifyPtx does not do the
+// "VerifyPtx" part, it merely compiles the executable
+//
 TEST_F(GpuFtzEnabledTest, ExpFtz) {
   CompileAndVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"(
     CHECK-NOT: ex2.approx.f32
@@ -107,6 +122,11 @@ TEST_F(GpuFtzEnabledTest, ExpFtz) {
   )");
 }
 
+//
+// On the ROCM platform the "ptx" string is not populated for the compiled
+// executable, and hence the call to CompileAdnVerifyPtx does not do the
+// "VerifyPtx" part, it merely compiles the executable
+//
 TEST_F(GpuFtzDisabledTest, ExpFtz) {
   CompileAndVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"(
     CHECK-NOT: ex2.approx.f32
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
index 177e43309c3..3dd250c1d1d 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
@@ -105,13 +105,24 @@ TEST_F(GpuIndexTest, CompatibleUseLinearIndexWithReshapeAndBroadcast) {
                     .ValueOrDie();
 
   // Check the optimized IR reuses the linear index by calculating modulo 14.
-  CompileAndVerifyIr(std::move(module),
-                     R"(
+
+  // In the IR generated for AMDGPUs, we do not seem to have the
+  // the addrspace(1) attribute for the lines being checked by the following
+  // patterns still need to investigate why that is the case, and whether or not
+  // it is ok
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK: %[[urem1:.*]] = urem i{{[0-9]*}} %[[linear_index:.*]], 14
+; CHECK: %[[bitcast:.*]] = bitcast i8* %[[alloc:.*]] to float*
+; CHECK: %[[idx1:.*]] = zext i{{[0-9]*}} %[[urem1]] to i64
+; CHECK: getelementptr inbounds float, float* %[[bitcast]], i64 %[[idx1]]
+      )"
+                                         : R"(
 ; CHECK: %[[urem1:.*]] = urem i{{[0-9]*}} %[[linear_index:.*]], 14
 ; CHECK: %[[bitcast:.*]] = bitcast i8 addrspace(1)* %[[alloc:.*]] to float addrspace(1)*
 ; CHECK: %[[idx1:.*]] = zext i{{[0-9]*}} %[[urem1]] to i64
 ; CHECK: getelementptr inbounds float, float addrspace(1)* %[[bitcast]], i64 %[[idx1]]
-      )",
+      )";
+  CompileAndVerifyIr(std::move(module), expected_ir,
                      /*match_optimized_ir=*/true);
 }
 
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_input_fusible_slice_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_input_fusible_slice_test.cc
index 7f345c19331..369060897df 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_input_fusible_slice_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_input_fusible_slice_test.cc
@@ -63,12 +63,17 @@ TEST_F(GpuSliceInputFusionTest, InputFusionWithOnlyOneSlice) {
   auto hlo_module =
       ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
           .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @fusion
+; CHECK: slice0
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @fusion
 ; CHECK: slice0
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                      /*match_optimized_ir=*/false);
   // Check that the kernel runs correctly.
   EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0, 0}));
@@ -100,12 +105,17 @@ TEST_F(GpuSliceInputFusionTest, InputFusionWithATupleOfSlices) {
   auto hlo_module =
       ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
           .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @fusion
+; CHECK: slice2
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @fusion
 ; CHECK: slice2
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                      /*match_optimized_ir=*/false);
   // Check that the kernel runs correctly.
   EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0, 0}));
@@ -142,12 +152,17 @@ TEST_F(GpuSliceInputFusionTest, ConcatThenSplit) {
   auto hlo_module =
       ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
           .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @fusion
+; CHECK: slice2
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @fusion
 ; CHECK: slice2
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                      /*match_optimized_ir=*/false);
   // Check that the kernel runs correctly.
   EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0, 0}));
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
index ae10fb161d6..a12df5f1010 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
@@ -63,12 +63,19 @@ TEST_F(GpuKernelTilingTest, UnnestedTransposeWithProperDimensionsTiled) {
   auto hlo_module =
       ParseAndReturnVerifiedModule(kHloString, ConfigWithLayoutAssignment())
           .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @copy
+; CHECK: call void @llvm.amdgcn.s.barrier()
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @copy
 ; CHECK: call void @llvm.nvvm.barrier0()
 ; CHECK: }
-)",
+)";
+
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                      /*match_optimized_ir=*/true);
 
   // Check that the kernel runs correctly.
@@ -90,12 +97,17 @@ TEST_F(GpuKernelTilingTest, UnnestedTransposeWithSmallDimensionsNotTiled) {
   auto hlo_module =
       ParseAndReturnVerifiedModule(kHloString, ConfigWithLayoutAssignment())
           .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @copy
+; CHECK-NOT: call void @llvm.amdgcn.s.barrier()
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @copy
 ; CHECK-NOT: call void @llvm.nvvm.barrier0()
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                      /*match_optimized_ir=*/true);
 }
 
@@ -134,12 +146,17 @@ TEST_F(GpuKernelTilingTest, SimpleFusionWithTransposeTiled) {
   auto hlo_module =
       ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
           .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @fusion
+; CHECK: call void @llvm.amdgcn.s.barrier()
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @fusion
 ; CHECK: call void @llvm.nvvm.barrier0()
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                      /*match_optimized_ir=*/true);
 
   // Check that the kernel runs correctly.
@@ -169,12 +186,17 @@ TEST_F(GpuKernelTilingTest, MultipleOutputFusionWithOnePossibleTransposeTiled) {
   auto hlo_module =
       ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
           .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @fusion
+; CHECK: call void @llvm.amdgcn.s.barrier()
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @fusion
 ; CHECK: call void @llvm.nvvm.barrier0()
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                      /*match_optimized_ir=*/true);
 
   // Check that the kernel runs correctly.
@@ -205,12 +227,17 @@ TEST_F(GpuKernelTilingTest,
   auto hlo_module =
       ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
           .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @fusion
+; CHECK-NOT: call void @llvm.amdgcn.s.barrier()
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @fusion
 ; CHECK-NOT: call void @llvm.nvvm.barrier0()
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                      /*match_optimized_ir=*/true);
 }
 
@@ -233,12 +260,17 @@ TEST_F(GpuKernelTilingTest, TransposedInputWithUserReverseNotTiled) {
   auto hlo_module =
       ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
           .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @fusion
+; CHECK-NOT: call void @llvm.amdgcn.s.barrier()
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @fusion
 ; CHECK-NOT: call void @llvm.nvvm.barrier0()
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                      /*match_optimized_ir=*/true);
 }
 
@@ -261,12 +293,17 @@ TEST_F(GpuKernelTilingTest, TransposedInputWithUserBitcastNotTiled) {
   auto hlo_module =
       ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
           .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @fusion
+; CHECK-NOT: call void @llvm.amdgcn.s.barrier()
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @fusion
 ; CHECK-NOT: call void @llvm.nvvm.barrier0()
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                      /*match_optimized_ir=*/true);
 
   // Check that the kernel runs correctly.
@@ -297,12 +334,17 @@ TEST_F(GpuKernelTilingTest, TransposedInputWithoutUnsafeUseTiled) {
   auto hlo_module =
       ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
           .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @fusion
+; CHECK: call void @llvm.amdgcn.s.barrier()
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @fusion
 ; CHECK: call void @llvm.nvvm.barrier0()
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                      /*match_optimized_ir=*/true);
   // Check that the kernel runs correctly.
   EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0.0}));
@@ -329,14 +371,31 @@ TEST_F(GpuKernelTilingTest, ColumnReductionWithPowerOf2OutputElementsUnrolled) {
   auto hlo_module =
       ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
           .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @fusion
+;
+; CHECK-LABEL: atomic_op_loop_body{{.*}}:
+; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}}
+; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32
+; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]]
+;
+; CHECK-LABEL: atomic_op_loop_body{{.*}}:
+; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}}
+; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32
+; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]]
+;
+; CHECK-NOT: cmpxchg
+;
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @fusion
 ; CHECK: atomicrmw fadd float
 ; CHECK: atomicrmw fadd float
 ; CHECK-NOT: atomicrmw fadd float
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                      /*match_optimized_ir=*/true);
   // Check that the kernel runs correctly.
   EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5}));
@@ -376,13 +435,25 @@ TEST_F(GpuKernelTilingTest,
   auto hlo_module =
       ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
           .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @fusion
+;
+; CHECK-LABEL: atomic_op_loop_body{{.*}}:
+; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}}
+; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32
+; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]]
+;
+; CHECK-NOT: cmpxchg
+;
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @fusion
 ; CHECK: atomicrmw fadd float
 ; CHECK-NOT: atomicrmw fadd float
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                      /*match_optimized_ir=*/true);
   // Check that the kernel runs correctly.
   EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5}));
@@ -424,8 +495,34 @@ TEST_F(GpuKernelTilingTest, ColumnReductionMOFUnrolled) {
   auto hlo_module =
       ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
           .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @fusion
+;
+; CHECK-LABEL: atomic_op_loop_body{{.*}}:
+; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}}
+; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32
+; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]]
+;
+; CHECK-LABEL: atomic_op_loop_body{{.*}}:
+; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}}
+; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32
+; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]]
+;
+; CHECK-LABEL: atomic_op_loop_body{{.*}}:
+; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}}
+; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32
+; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]]
+;
+; CHECK-LABEL: atomic_op_loop_body{{.*}}:
+; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}}
+; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32
+; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]]
+;
+; CHECK-NOT: cmpxchg
+;
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @fusion
 ; CHECK: atomicrmw fadd float
 ; CHECK: atomicrmw fadd float
@@ -433,7 +530,8 @@ TEST_F(GpuKernelTilingTest, ColumnReductionMOFUnrolled) {
 ; CHECK: atomicrmw fadd float
 ; CHECK-NOT: atomicrmw fadd float
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                      /*match_optimized_ir=*/true);
   // Check that the kernel runs correctly.
   EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5}));
@@ -459,12 +557,20 @@ TEST_F(GpuKernelTilingTest, ColumnReductionWithLayoutChangeTiled) {
   auto hlo_module =
       ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
           .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @
+; CHECK-LABEL: atomic_op_loop_body{{.*}}:
+; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}}
+; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32
+; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]]
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @
 ; CHECK: atomicrmw fadd float
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                      /*match_optimized_ir=*/true);
 
   // Check that the kernel runs correctly.
@@ -491,12 +597,17 @@ TEST_F(GpuKernelTilingTest, RowReductionWithLayoutChangeTiled) {
   auto hlo_module =
       ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
           .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @reduce
+; CHECK: call i32 @llvm.amdgcn.ds.bpermute
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @reduce
 ; CHECK: call float @llvm.nvvm.shfl.sync.down.f32
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                      /*match_optimized_ir=*/true);
 
   // Check that the kernel runs correctly.
@@ -524,12 +635,20 @@ TEST_F(GpuKernelTilingTest,
   auto hlo_module =
       ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
           .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @reduce
+; CHECK-LABEL: atomic_op_loop_body{{.*}}:
+; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}}
+; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32
+; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]]
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @reduce
 ; CHECK: atomicrmw fadd float
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                      /*match_optimized_ir=*/true);
 
   // Check that the kernel runs correctly.
@@ -570,12 +689,17 @@ TEST_F(GpuKernelTilingTest, ColumnReductionSmallTileSizeX) {
   auto hlo_module =
       ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
           .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @fusion
+; CHECK-NOT: reduce.0.loop_header
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @fusion
 ; CHECK-NOT: reduce.0.loop_header
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                      /*match_optimized_ir=*/true);
   // Check that the kernel runs correctly.
   EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5}));
@@ -601,12 +725,17 @@ TEST_F(GpuKernelTilingTest, RowReductionWithSmallDimensionNotTiled) {
   auto hlo_module =
       ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
           .ValueOrDie();
-  CompileAndVerifyIr(std::move(hlo_module),
-                     R"(
+  auto expected_ir = is_built_with_rocm_ ? R"(
+; CHECK-LABEL: define amdgpu_kernel void @reduce
+; CHECK-NOT: call i32 @llvm.amdgcn.ds.bpermute
+; CHECK: }
+)"
+                                         : R"(
 ; CHECK-LABEL: define void @reduce
 ; CHECK-NOT: call float @llvm.nvvm.shfl.sync.down.f32
 ; CHECK: }
-)",
+)";
+  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                      /*match_optimized_ir=*/true);
 
   // Check that the kernel runs correctly.
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
index 8b844e66b90..3b19b50eece 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
@@ -38,6 +38,11 @@ class GpuLdgTest : public GpuCodegenTest {};
 
 // Parameters are never overwritten, so parameter reads should get ld.global.nc
 // reads.
+//
+// On the ROCM platform the "ptx" string is not populated for the compiled
+// executable, and hence the call to CompileAdnVerifyPtx does not do the
+// "VerifyPtx" part, it merely compiles the executable
+//
 TEST_F(GpuLdgTest, LdgForParamRead) {
   HloComputation::Builder builder(TestName());
 
@@ -60,6 +65,11 @@ TEST_F(GpuLdgTest, LdgForParamRead) {
 // Check that reading a buffer produced by a non-parameter HLO also results in
 // ld.global.nc, if that buffer isn't modified within the instruction that reads
 // it.
+//
+// On the ROCM platform the "ptx" string is not populated for the compiled
+// executable, and hence the call to CompileAdnVerifyPtx does not do the
+// "VerifyPtx" part, it merely compiles the executable
+//
 TEST_F(GpuLdgTest, LdgForNonParamRead) {
   HloComputation::Builder builder(TestName());
 
@@ -94,6 +104,11 @@ TEST_F(GpuLdgTest, LdgForNonParamRead) {
 // It seems like a fair bet that we won't start fusing sin into the output of
 // reduce in the foreseeable future.  But if that turns out to be wrong, I give
 // you, future reader, permission to delete this test.
+//
+// On the ROCM platform the "ptx" string is not populated for the compiled
+// executable, and hence the call to CompileAdnVerifyPtx does not do the
+// "VerifyPtx" part, it merely compiles the executable
+//
 TEST_F(GpuLdgTest, NoLdgWhenSharingBuffer) {
   auto hlo_module = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());

From 88a1e3b399d7f46cc33ed9a6d14f1873e292bf36 Mon Sep 17 00:00:00 2001
From: Deven Desai <deven.desai.amd@gmail.com>
Date: Fri, 3 Jan 2020 18:11:27 +0000
Subject: [PATCH 2/4] [ROCm] Fix to enable XLA_GPU device registration for ROCm
 platform

---
 tensorflow/compiler/jit/xla_gpu_device.cc | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc
index 91943edd775..16f496d51a3 100644
--- a/tensorflow/compiler/jit/xla_gpu_device.cc
+++ b/tensorflow/compiler/jit/xla_gpu_device.cc
@@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 // Registers the XLA_GPU device, which is an XlaDevice instantiation that runs
-// operators using XLA via the XLA "CUDA" (GPU) backend.
+// operators using XLA via the XLA "CUDA" or "ROCM" (GPU) backend.
 
 #include <set>
 
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/compiler/jit/xla_device_ops.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
@@ -69,7 +70,8 @@ Status XlaGpuDeviceFactory::ListPhysicalDevices(std::vector<string>* devices) {
     return Status::OK();
   }
 
-  auto platform = se::MultiPlatformManager::PlatformWithName("CUDA");
+  auto platform =
+      se::MultiPlatformManager::PlatformWithName(tensorflow::GpuPlatformName());
   if (!platform.ok()) {
     // Treat failures as non-fatal; there might not be a GPU in the machine.
     VLOG(1) << "Failed to create XLA_GPU device: " << platform.status();
@@ -117,7 +119,8 @@ Status XlaGpuDeviceFactory::CreateDevices(
       RegisterXlaDeviceKernels(DEVICE_XLA_GPU, DEVICE_GPU_XLA_JIT);
   (void)registrations;
 
-  auto platform = se::MultiPlatformManager::PlatformWithName("CUDA");
+  auto platform =
+      se::MultiPlatformManager::PlatformWithName(tensorflow::GpuPlatformName());
   if (!platform.ok()) {
     // Treat failures as non-fatal; there might not be a GPU in the machine.
     VLOG(1) << "Failed to create XLA_GPU device: " << platform.status();

From 11b85f74734aa3cc2df422aec8a758d91d2ae1e0 Mon Sep 17 00:00:00 2001
From: Deven Desai <deven.desai.amd@gmail.com>
Date: Thu, 2 Jan 2020 21:42:54 +0000
Subject: [PATCH 3/4] [ROCm] Adding no_rocm tag to XLA tests that fail on the
 ROCm platform

---
 tensorflow/compiler/tests/BUILD               | 29 +++++++++++++++----
 tensorflow/compiler/tests/build_defs.bzl      |  3 +-
 .../compiler/xla/service/mlir_gpu/tests/BUILD |  2 +-
 tensorflow/compiler/xla/tests/BUILD           | 28 +++++++++++++++---
 4 files changed, 51 insertions(+), 11 deletions(-)

diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 4c3dcd81eb7..3ec240357c9 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -240,7 +240,10 @@ tf_xla_py_test(
     size = "medium",
     srcs = ["cholesky_op_test.py"],
     python_version = "PY3",
-    tags = ["optonly"],
+    tags = [
+        "no_rocm",
+        "optonly",
+    ],
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
@@ -297,7 +300,10 @@ tf_xla_py_test(
         "cpu_ondemand",
     ],
     python_version = "PY3",
-    tags = ["optonly"],
+    tags = [
+        "no_rocm",
+        "optonly",
+    ],
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
@@ -382,7 +388,10 @@ tf_xla_py_test(
     size = "medium",
     srcs = ["concat_ops_test.py"],
     python_version = "PY3",
-    tags = ["many_xla_args"],
+    tags = [
+        "many_xla_args",
+        "no_rocm",
+    ],
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
@@ -568,7 +577,10 @@ tf_xla_py_test(
     srcs = ["fft_test.py"],
     python_version = "PY3",
     shard_count = 6,
-    tags = ["optonly"],
+    tags = [
+        "no_rocm",
+        "optonly",
+    ],
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
@@ -845,7 +857,10 @@ tf_xla_py_test(
     srcs = ["unstack_test.py"],
     python_version = "PY3",
     shard_count = 5,
-    tags = ["optonly"],
+    tags = [
+        "no_rocm",
+        "optonly",
+    ],
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
@@ -1292,6 +1307,7 @@ cuda_py_test(
     size = "medium",
     srcs = ["jit_test.py"],
     shard_count = 5,
+    tags = ["no_rocm"],
     xla_enable_strict_auto_jit = False,
     deps = [
         ":test_utils",
@@ -1312,6 +1328,7 @@ cuda_py_test(
     name = "dense_layer_test",
     size = "medium",
     srcs = ["dense_layer_test.py"],
+    tags = ["no_rocm"],
     xla_enable_strict_auto_jit = False,
     deps = [
         ":test_utils",
@@ -1396,6 +1413,7 @@ py_library(
 cuda_py_test(
     name = "lstm_test",
     srcs = ["lstm_test.py"],
+    tags = ["no_rocm"],
     xla_enable_strict_auto_jit = False,
     deps = [
         ":lstm",
@@ -1498,6 +1516,7 @@ tf_xla_py_test(
     srcs = ["conv_node_name_test.py"],
     python_version = "PY3",
     shard_count = 5,
+    tags = ["no_rocm"],
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
diff --git a/tensorflow/compiler/tests/build_defs.bzl b/tensorflow/compiler/tests/build_defs.bzl
index 04cb2a0b975..277efd1f013 100644
--- a/tensorflow/compiler/tests/build_defs.bzl
+++ b/tensorflow/compiler/tests/build_defs.bzl
@@ -1,6 +1,7 @@
 """Build rules for Tensorflow/XLA testing."""
 
 load("@local_config_cuda//cuda:build_defs.bzl", "cuda_is_configured")
+load("@local_config_rocm//rocm:build_defs.bzl", "rocm_is_configured")
 load("//tensorflow/compiler/tests:plugin.bzl", "plugins")
 load(
     "//tensorflow/core/platform:build_config_root.bzl",
@@ -10,7 +11,7 @@ load(
 
 def all_backends():
     b = ["cpu"] + plugins.keys()
-    if cuda_is_configured():
+    if cuda_is_configured() or rocm_is_configured():
         return b + ["gpu"]
     else:
         return b
diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD b/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD
index fded1859e33..16077260607 100644
--- a/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD
+++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD
@@ -21,7 +21,7 @@ package_group(
 tf_cc_test(
     name = "mlir_gpu_lhlo_gen_test",
     srcs = ["mlir_gpu_lhlo_gen_test.cc"],
-    tags = tf_cuda_tests_tags(),
+    tags = tf_cuda_tests_tags() + ["no_rocm"],
     deps = [
         "//tensorflow/compiler/xla/service:mlir_gpu_plugin",
         "//tensorflow/compiler/xla/service/mlir_gpu:mlir_irgen_test_base",
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index b2cc8050c42..35aad4cab47 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -587,6 +587,7 @@ xla_test(
     name = "conditional_test",
     srcs = ["conditional_test.cc"],
     shard_count = 2,
+    tags = ["no_rocm"],
     deps = [
         ":test_macros_header",
         "//tensorflow/compiler/xla:xla_data_proto_cc",
@@ -625,6 +626,7 @@ xla_test(
     name = "scalar_computations_test",
     srcs = ["scalar_computations_test.cc"],
     shard_count = 32,
+    tags = ["no_rocm"],
     deps = [
         ":test_macros_header",
         "//tensorflow/compiler/xla:literal",
@@ -924,6 +926,7 @@ xla_test(
     srcs = ["dot_operation_test.cc"],
     shard_count = 20,
     tags = [
+        "no_rocm",
         "optonly",
     ],
     deps = [
@@ -957,6 +960,7 @@ xla_test(
     backends = ["gpu"],
     shard_count = 20,
     tags = [
+        "no_rocm",
         "optonly",
     ],
     deps = [
@@ -1019,7 +1023,10 @@ xla_test(
         ],
     },
     shard_count = 20,
-    tags = ["optonly"],
+    tags = [
+        "no_rocm",
+        "optonly",
+    ],
     deps = [
         ":test_macros_header",
         "//tensorflow/compiler/xla:array2d",
@@ -1113,7 +1120,10 @@ xla_test(
     timeout = "long",
     srcs = ["convolution_test.cc"],
     shard_count = 40,
-    tags = ["optonly"],
+    tags = [
+        "no_rocm",
+        "optonly",
+    ],
     deps = CONVOLUTION_TEST_DEPS + [
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
@@ -1130,7 +1140,10 @@ xla_test(
     args = ["--xla_gpu_disable_autotune"],
     backends = ["gpu"],
     shard_count = 40,
-    tags = ["optonly"],
+    tags = [
+        "no_rocm",
+        "optonly",
+    ],
     deps = CONVOLUTION_TEST_DEPS + [
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
@@ -1144,6 +1157,7 @@ xla_test(
     backend_args = {"gpu": ["--xla_backend_extra_options=xla_gpu_experimental_conv_disable_layout_heuristic"]},
     backends = ["gpu"],
     shard_count = 25,
+    tags = ["no_rocm"],
     deps = CONVOLUTION_TEST_DEPS + [
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
@@ -1213,6 +1227,7 @@ xla_test(
         "interpreter",
     ],
     shard_count = 40,
+    tags = ["no_rocm"],
     deps = [
         ":client_library_test_base",
         ":hlo_test_base",
@@ -1418,6 +1433,7 @@ xla_test(
     srcs = ["reduce_test.cc"],
     shard_count = 31,
     tags = [
+        "no_rocm",
         "optonly",
     ],
     deps = [
@@ -1497,6 +1513,7 @@ xla_test(
     timeout = "long",
     srcs = ["select_and_scatter_test.cc"],
     tags = [
+        "no_rocm",
         "optonly",
     ],
     deps = [
@@ -2543,7 +2560,10 @@ xla_test(
 xla_test(
     name = "cholesky_test",
     srcs = ["cholesky_test.cc"],
-    tags = ["optonly"],
+    tags = [
+        "no_rocm",
+        "optonly",
+    ],
     deps = [
         ":test_macros_header",
         "//tensorflow/compiler/xla:array2d",

From 3e4a3d5c83be06c05a0522edd5c5690dda5efa3b Mon Sep 17 00:00:00 2001
From: Deven Desai <deven.desai.amd@gmail.com>
Date: Thu, 16 Jan 2020 02:48:17 +0000
Subject: [PATCH 4/4] changes to address code review feedback

---
 .../xla/service/gpu/tests/gpu_codegen_test.cc | 12 ++++----
 .../xla/service/gpu/tests/gpu_codegen_test.h  |  7 +++--
 .../xla/service/gpu/tests/gpu_ftz_test.cc     | 28 +++----------------
 .../xla/service/gpu/tests/gpu_index_test.cc   | 20 +++++--------
 .../xla/service/gpu/tests/gpu_ldg_test.cc     |  6 ++--
 5 files changed, 25 insertions(+), 48 deletions(-)

diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc
index ce62fe205ab..e9af2336922 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc
@@ -46,7 +46,7 @@ GpuCodegenTest::CreateNewVerifiedModuleWithFTZ(bool ftz) {
       ShapeUtil::ByteSizeOfElements);
 }
 
-void GpuCodegenTest::CompileAndVerifyPtx(
+void GpuCodegenTest::CompileAndOptionallyVerifyPtx(
     std::unique_ptr<VerifiedHloModule> hlo_module, absl::string_view pattern) {
   std::unique_ptr<Executable> executable =
       std::move(CompileToExecutable(std::move(hlo_module)).ValueOrDie());
@@ -55,11 +55,11 @@ void GpuCodegenTest::CompileAndVerifyPtx(
   // On the ROCM platform the "ptx" string is not populated for the compiled
   // executable, and hence the "ptx_str" will be empty. So disabling the
   // pattern check on the ROCm platform
-#if !defined(TENSORFLOW_USE_ROCM)
-  StatusOr<bool> filecheck_result = RunFileCheck(ptx_str, pattern);
-  ASSERT_TRUE(filecheck_result.ok());
-  EXPECT_TRUE(filecheck_result.ValueOrDie());
-#endif
+  if (!is_built_with_rocm_) {
+    StatusOr<bool> filecheck_result = RunFileCheck(ptx_str, pattern);
+    ASSERT_TRUE(filecheck_result.ok());
+    EXPECT_TRUE(filecheck_result.ValueOrDie());
+  }
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h
index 5f5b21150c1..c187e90301d 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h
@@ -39,8 +39,11 @@ class GpuCodegenTest : public LlvmIrGenTestBase {
 
   // Compiles the given HLO module to PTX and verifies the PTX matches the given
   // FileCheck pattern.  (See http://llvm.org/docs/CommandGuide/FileCheck.html).
-  void CompileAndVerifyPtx(std::unique_ptr<VerifiedHloModule> hlo_module,
-                           absl::string_view pattern);
+  // The "VerifyPtx" part only happens on the CUDA platform,
+  // and hence the "Optionally" in function name.
+  // For ROCm platform this routine will only do the "Compile" part.
+  void CompileAndOptionallyVerifyPtx(
+      std::unique_ptr<VerifiedHloModule> hlo_module, absl::string_view pattern);
 
   bool is_built_with_rocm_;
 };
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
index 1e95119d7ae..282f7b24a31 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
@@ -76,25 +76,15 @@ class GpuFtzDisabledTest : public GpuFtzTest {
 };
 
 // Check that we emit mul.ftz.f32 when in ftz mode, and plain mul.f32 otherwise.
-//
-// On the ROCM platform the "ptx" string is not populated for the compiled
-// executable, and hence the call to CompileAdnVerifyPtx does not do the
-// "VerifyPtx" part, it merely compiles the executable
-//
 TEST_F(GpuFtzEnabledTest, MultiplyFtz) {
-  CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"(
+  CompileAndOptionallyVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"(
     CHECK-NOT: mul.rn.f32
     CHECK: mul.rn.ftz.f32
     CHECK-NOT: mul.rn.f32
   )");
 }
-//
-// On the ROCM platform the "ptx" string is not populated for the compiled
-// executable, and hence the call to CompileAdnVerifyPtx does not do the
-// "VerifyPtx" part, it merely compiles the executable
-//
 TEST_F(GpuFtzDisabledTest, MultiplyFtz) {
-  CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"(
+  CompileAndOptionallyVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"(
     CHECK-NOT: mul.rn.ftz.f32
     CHECK: mul.rn.f32
     CHECK-NOT: mul.rn.ftz.f32
@@ -106,13 +96,8 @@ TEST_F(GpuFtzDisabledTest, MultiplyFtz) {
 // calls to ex2.approx.  When ftz is on, we get two calls to the ftz version;
 // when ftz is off, we get one call to the ftz version and one call to the
 // regular version.
-//
-// On the ROCM platform the "ptx" string is not populated for the compiled
-// executable, and hence the call to CompileAdnVerifyPtx does not do the
-// "VerifyPtx" part, it merely compiles the executable
-//
 TEST_F(GpuFtzEnabledTest, ExpFtz) {
-  CompileAndVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"(
+  CompileAndOptionallyVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"(
     CHECK-NOT: ex2.approx.f32
     CHECK:     ex2.approx.ftz.f32
     CHECK-NOT: ex2.approx.f32
@@ -122,13 +107,8 @@ TEST_F(GpuFtzEnabledTest, ExpFtz) {
   )");
 }
 
-//
-// On the ROCM platform the "ptx" string is not populated for the compiled
-// executable, and hence the call to CompileAdnVerifyPtx does not do the
-// "VerifyPtx" part, it merely compiles the executable
-//
 TEST_F(GpuFtzDisabledTest, ExpFtz) {
-  CompileAndVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"(
+  CompileAndOptionallyVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"(
     CHECK-NOT: ex2.approx.f32
     CHECK-DAG: ex2.approx.ftz.f32
     CHECK-DAG: ex2.approx.f32
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
index 3dd250c1d1d..67b291c8fcb 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
@@ -108,21 +108,15 @@ TEST_F(GpuIndexTest, CompatibleUseLinearIndexWithReshapeAndBroadcast) {
 
   // In the IR generated for AMDGPUs, we do not seem to have the
   // the addrspace(1) attribute for the lines being checked by the following
-  // patterns still need to investigate why that is the case, and whether or not
-  // it is ok
-  auto expected_ir = is_built_with_rocm_ ? R"(
+  // patterns.
+  // need to investigate why that is the case, and whether or not it is ok
+  CompileAndVerifyIr(std::move(module),
+                     R"(
 ; CHECK: %[[urem1:.*]] = urem i{{[0-9]*}} %[[linear_index:.*]], 14
-; CHECK: %[[bitcast:.*]] = bitcast i8* %[[alloc:.*]] to float*
+; CHECK: %[[bitcast:.*]] = bitcast i8{{( addrspace\(1\))?}}* %[[alloc:.*]] to float{{( addrspace\(1\))?}}*
 ; CHECK: %[[idx1:.*]] = zext i{{[0-9]*}} %[[urem1]] to i64
-; CHECK: getelementptr inbounds float, float* %[[bitcast]], i64 %[[idx1]]
-      )"
-                                         : R"(
-; CHECK: %[[urem1:.*]] = urem i{{[0-9]*}} %[[linear_index:.*]], 14
-; CHECK: %[[bitcast:.*]] = bitcast i8 addrspace(1)* %[[alloc:.*]] to float addrspace(1)*
-; CHECK: %[[idx1:.*]] = zext i{{[0-9]*}} %[[urem1]] to i64
-; CHECK: getelementptr inbounds float, float addrspace(1)* %[[bitcast]], i64 %[[idx1]]
-      )";
-  CompileAndVerifyIr(std::move(module), expected_ir,
+; CHECK: getelementptr inbounds float, float{{( addrspace\(1\))?}}* %[[bitcast]], i64 %[[idx1]]
+      )",
                      /*match_optimized_ir=*/true);
 }
 
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
index 3b19b50eece..aca3cca7b11 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
@@ -56,7 +56,7 @@ TEST_F(GpuLdgTest, LdgForParamRead) {
   auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEntryComputation(std::move(computation));
 
-  CompileAndVerifyPtx(std::move(hlo_module), R"(
+  CompileAndOptionallyVerifyPtx(std::move(hlo_module), R"(
     CHECK-NOT: ld.global.f32
     CHECK: ld.global.nc.f32
   )");
@@ -86,7 +86,7 @@ TEST_F(GpuLdgTest, LdgForNonParamRead) {
   auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEntryComputation(std::move(computation));
 
-  CompileAndVerifyPtx(std::move(hlo_module), R"(
+  CompileAndOptionallyVerifyPtx(std::move(hlo_module), R"(
     CHECK: {
     CHECK-NOT: ld.global.f32
     CHECK: ld.global.nc.f32
@@ -143,7 +143,7 @@ TEST_F(GpuLdgTest, NoLdgWhenSharingBuffer) {
   std::unique_ptr<HloComputation> computation = builder.Build();
   hlo_module->AddEntryComputation(std::move(computation));
 
-  CompileAndVerifyPtx(std::move(hlo_module), R"(
+  CompileAndOptionallyVerifyPtx(std::move(hlo_module), R"(
     CHECK-LABEL: .entry sin
     CHECK: {
     CHECK-NOT: ld.global.nc.f32