From c14b6951de82cd4c4957ccb181ef2946a8309ff1 Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Fri, 3 Jan 2020 15:33:23 +0000 Subject: [PATCH 1/4] [ROCm] adding ROCm specific versions of "expected" results for FileCheck --- .../xla/service/gpu/tests/gpu_codegen_test.cc | 6 + .../xla/service/gpu/tests/gpu_codegen_test.h | 7 + .../xla/service/gpu/tests/gpu_ftz_test.cc | 20 ++ .../xla/service/gpu/tests/gpu_index_test.cc | 17 +- .../gpu/tests/gpu_input_fusible_slice_test.cc | 33 ++- .../gpu/tests/gpu_kernel_tiling_test.cc | 225 ++++++++++++++---- .../xla/service/gpu/tests/gpu_ldg_test.cc | 15 ++ 7 files changed, 263 insertions(+), 60 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc index 36ff644fb2d..ce62fe205ab 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc @@ -51,9 +51,15 @@ void GpuCodegenTest::CompileAndVerifyPtx( std::unique_ptr executable = std::move(CompileToExecutable(std::move(hlo_module)).ValueOrDie()); string ptx_str(static_cast(executable.get())->text()); + + // On the ROCM platform the "ptx" string is not populated for the compiled + // executable, and hence the "ptx_str" will be empty. So disabling the + // pattern check on the ROCm platform +#if !defined(TENSORFLOW_USE_ROCM) StatusOr filecheck_result = RunFileCheck(ptx_str, pattern); ASSERT_TRUE(filecheck_result.ok()); EXPECT_TRUE(filecheck_result.ValueOrDie()); +#endif } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h index 83cce1ccd3c..5f5b21150c1 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h @@ -27,6 +27,11 @@ namespace gpu { // Tests that verify IR or PTX emitted by the GPU backend is as expected. class GpuCodegenTest : public LlvmIrGenTestBase { + public: + GpuCodegenTest() + : is_built_with_rocm_( + se::MultiPlatformManager::PlatformWithName("ROCM").ok()) {} + protected: // Like HloTestBase::CreateNewVerifiedModule(), with a flag for configuring // the ftz option. @@ -36,6 +41,8 @@ class GpuCodegenTest : public LlvmIrGenTestBase { // FileCheck pattern. (See http://llvm.org/docs/CommandGuide/FileCheck.html). void CompileAndVerifyPtx(std::unique_ptr hlo_module, absl::string_view pattern); + + bool is_built_with_rocm_; }; } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc index e2a2d127eff..1e95119d7ae 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc @@ -76,6 +76,11 @@ class GpuFtzDisabledTest : public GpuFtzTest { }; // Check that we emit mul.ftz.f32 when in ftz mode, and plain mul.f32 otherwise. +// +// On the ROCM platform the "ptx" string is not populated for the compiled +// executable, and hence the call to CompileAdnVerifyPtx does not do the +// "VerifyPtx" part, it merely compiles the executable +// TEST_F(GpuFtzEnabledTest, MultiplyFtz) { CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"( CHECK-NOT: mul.rn.f32 @@ -83,6 +88,11 @@ TEST_F(GpuFtzEnabledTest, MultiplyFtz) { CHECK-NOT: mul.rn.f32 )"); } +// +// On the ROCM platform the "ptx" string is not populated for the compiled +// executable, and hence the call to CompileAdnVerifyPtx does not do the +// "VerifyPtx" part, it merely compiles the executable +// TEST_F(GpuFtzDisabledTest, MultiplyFtz) { CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"( CHECK-NOT: mul.rn.ftz.f32 @@ -96,6 +106,11 @@ TEST_F(GpuFtzDisabledTest, MultiplyFtz) { // calls to ex2.approx. When ftz is on, we get two calls to the ftz version; // when ftz is off, we get one call to the ftz version and one call to the // regular version. +// +// On the ROCM platform the "ptx" string is not populated for the compiled +// executable, and hence the call to CompileAdnVerifyPtx does not do the +// "VerifyPtx" part, it merely compiles the executable +// TEST_F(GpuFtzEnabledTest, ExpFtz) { CompileAndVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"( CHECK-NOT: ex2.approx.f32 @@ -107,6 +122,11 @@ TEST_F(GpuFtzEnabledTest, ExpFtz) { )"); } +// +// On the ROCM platform the "ptx" string is not populated for the compiled +// executable, and hence the call to CompileAdnVerifyPtx does not do the +// "VerifyPtx" part, it merely compiles the executable +// TEST_F(GpuFtzDisabledTest, ExpFtz) { CompileAndVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"( CHECK-NOT: ex2.approx.f32 diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc index 177e43309c3..3dd250c1d1d 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc @@ -105,13 +105,24 @@ TEST_F(GpuIndexTest, CompatibleUseLinearIndexWithReshapeAndBroadcast) { .ValueOrDie(); // Check the optimized IR reuses the linear index by calculating modulo 14. - CompileAndVerifyIr(std::move(module), - R"( + + // In the IR generated for AMDGPUs, we do not seem to have the + // the addrspace(1) attribute for the lines being checked by the following + // patterns still need to investigate why that is the case, and whether or not + // it is ok + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK: %[[urem1:.*]] = urem i{{[0-9]*}} %[[linear_index:.*]], 14 +; CHECK: %[[bitcast:.*]] = bitcast i8* %[[alloc:.*]] to float* +; CHECK: %[[idx1:.*]] = zext i{{[0-9]*}} %[[urem1]] to i64 +; CHECK: getelementptr inbounds float, float* %[[bitcast]], i64 %[[idx1]] + )" + : R"( ; CHECK: %[[urem1:.*]] = urem i{{[0-9]*}} %[[linear_index:.*]], 14 ; CHECK: %[[bitcast:.*]] = bitcast i8 addrspace(1)* %[[alloc:.*]] to float addrspace(1)* ; CHECK: %[[idx1:.*]] = zext i{{[0-9]*}} %[[urem1]] to i64 ; CHECK: getelementptr inbounds float, float addrspace(1)* %[[bitcast]], i64 %[[idx1]] - )", + )"; + CompileAndVerifyIr(std::move(module), expected_ir, /*match_optimized_ir=*/true); } diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_input_fusible_slice_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_input_fusible_slice_test.cc index 7f345c19331..369060897df 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_input_fusible_slice_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_input_fusible_slice_test.cc @@ -63,12 +63,17 @@ TEST_F(GpuSliceInputFusionTest, InputFusionWithOnlyOneSlice) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @fusion +; CHECK: slice0 +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @fusion ; CHECK: slice0 ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/false); // Check that the kernel runs correctly. EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0, 0})); @@ -100,12 +105,17 @@ TEST_F(GpuSliceInputFusionTest, InputFusionWithATupleOfSlices) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @fusion +; CHECK: slice2 +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @fusion ; CHECK: slice2 ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/false); // Check that the kernel runs correctly. EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0, 0})); @@ -142,12 +152,17 @@ TEST_F(GpuSliceInputFusionTest, ConcatThenSplit) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @fusion +; CHECK: slice2 +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @fusion ; CHECK: slice2 ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/false); // Check that the kernel runs correctly. EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0, 0})); diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc index ae10fb161d6..a12df5f1010 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc @@ -63,12 +63,19 @@ TEST_F(GpuKernelTilingTest, UnnestedTransposeWithProperDimensionsTiled) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @copy +; CHECK: call void @llvm.amdgcn.s.barrier() +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @copy ; CHECK: call void @llvm.nvvm.barrier0() ; CHECK: } -)", +)"; + + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); // Check that the kernel runs correctly. @@ -90,12 +97,17 @@ TEST_F(GpuKernelTilingTest, UnnestedTransposeWithSmallDimensionsNotTiled) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @copy +; CHECK-NOT: call void @llvm.amdgcn.s.barrier() +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @copy ; CHECK-NOT: call void @llvm.nvvm.barrier0() ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); } @@ -134,12 +146,17 @@ TEST_F(GpuKernelTilingTest, SimpleFusionWithTransposeTiled) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @fusion +; CHECK: call void @llvm.amdgcn.s.barrier() +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @fusion ; CHECK: call void @llvm.nvvm.barrier0() ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); // Check that the kernel runs correctly. @@ -169,12 +186,17 @@ TEST_F(GpuKernelTilingTest, MultipleOutputFusionWithOnePossibleTransposeTiled) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @fusion +; CHECK: call void @llvm.amdgcn.s.barrier() +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @fusion ; CHECK: call void @llvm.nvvm.barrier0() ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); // Check that the kernel runs correctly. @@ -205,12 +227,17 @@ TEST_F(GpuKernelTilingTest, auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @fusion +; CHECK-NOT: call void @llvm.amdgcn.s.barrier() +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @fusion ; CHECK-NOT: call void @llvm.nvvm.barrier0() ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); } @@ -233,12 +260,17 @@ TEST_F(GpuKernelTilingTest, TransposedInputWithUserReverseNotTiled) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @fusion +; CHECK-NOT: call void @llvm.amdgcn.s.barrier() +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @fusion ; CHECK-NOT: call void @llvm.nvvm.barrier0() ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); } @@ -261,12 +293,17 @@ TEST_F(GpuKernelTilingTest, TransposedInputWithUserBitcastNotTiled) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @fusion +; CHECK-NOT: call void @llvm.amdgcn.s.barrier() +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @fusion ; CHECK-NOT: call void @llvm.nvvm.barrier0() ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); // Check that the kernel runs correctly. @@ -297,12 +334,17 @@ TEST_F(GpuKernelTilingTest, TransposedInputWithoutUnsafeUseTiled) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @fusion +; CHECK: call void @llvm.amdgcn.s.barrier() +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @fusion ; CHECK: call void @llvm.nvvm.barrier0() ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); // Check that the kernel runs correctly. EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0.0})); @@ -329,14 +371,31 @@ TEST_F(GpuKernelTilingTest, ColumnReductionWithPowerOf2OutputElementsUnrolled) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @fusion +; +; CHECK-LABEL: atomic_op_loop_body{{.*}}: +; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}} +; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32 +; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]] +; +; CHECK-LABEL: atomic_op_loop_body{{.*}}: +; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}} +; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32 +; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]] +; +; CHECK-NOT: cmpxchg +; +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @fusion ; CHECK: atomicrmw fadd float ; CHECK: atomicrmw fadd float ; CHECK-NOT: atomicrmw fadd float ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); // Check that the kernel runs correctly. EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5})); @@ -376,13 +435,25 @@ TEST_F(GpuKernelTilingTest, auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @fusion +; +; CHECK-LABEL: atomic_op_loop_body{{.*}}: +; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}} +; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32 +; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]] +; +; CHECK-NOT: cmpxchg +; +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @fusion ; CHECK: atomicrmw fadd float ; CHECK-NOT: atomicrmw fadd float ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); // Check that the kernel runs correctly. EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5})); @@ -424,8 +495,34 @@ TEST_F(GpuKernelTilingTest, ColumnReductionMOFUnrolled) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @fusion +; +; CHECK-LABEL: atomic_op_loop_body{{.*}}: +; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}} +; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32 +; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]] +; +; CHECK-LABEL: atomic_op_loop_body{{.*}}: +; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}} +; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32 +; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]] +; +; CHECK-LABEL: atomic_op_loop_body{{.*}}: +; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}} +; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32 +; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]] +; +; CHECK-LABEL: atomic_op_loop_body{{.*}}: +; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}} +; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32 +; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]] +; +; CHECK-NOT: cmpxchg +; +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @fusion ; CHECK: atomicrmw fadd float ; CHECK: atomicrmw fadd float @@ -433,7 +530,8 @@ TEST_F(GpuKernelTilingTest, ColumnReductionMOFUnrolled) { ; CHECK: atomicrmw fadd float ; CHECK-NOT: atomicrmw fadd float ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); // Check that the kernel runs correctly. EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5})); @@ -459,12 +557,20 @@ TEST_F(GpuKernelTilingTest, ColumnReductionWithLayoutChangeTiled) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @ +; CHECK-LABEL: atomic_op_loop_body{{.*}}: +; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}} +; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32 +; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]] +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @ ; CHECK: atomicrmw fadd float ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); // Check that the kernel runs correctly. @@ -491,12 +597,17 @@ TEST_F(GpuKernelTilingTest, RowReductionWithLayoutChangeTiled) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @reduce +; CHECK: call i32 @llvm.amdgcn.ds.bpermute +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @reduce ; CHECK: call float @llvm.nvvm.shfl.sync.down.f32 ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); // Check that the kernel runs correctly. @@ -524,12 +635,20 @@ TEST_F(GpuKernelTilingTest, auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @reduce +; CHECK-LABEL: atomic_op_loop_body{{.*}}: +; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}} +; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32 +; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]] +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @reduce ; CHECK: atomicrmw fadd float ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); // Check that the kernel runs correctly. @@ -570,12 +689,17 @@ TEST_F(GpuKernelTilingTest, ColumnReductionSmallTileSizeX) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @fusion +; CHECK-NOT: reduce.0.loop_header +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @fusion ; CHECK-NOT: reduce.0.loop_header ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); // Check that the kernel runs correctly. EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5})); @@ -601,12 +725,17 @@ TEST_F(GpuKernelTilingTest, RowReductionWithSmallDimensionNotTiled) { auto hlo_module = ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment()) .ValueOrDie(); - CompileAndVerifyIr(std::move(hlo_module), - R"( + auto expected_ir = is_built_with_rocm_ ? R"( +; CHECK-LABEL: define amdgpu_kernel void @reduce +; CHECK-NOT: call i32 @llvm.amdgcn.ds.bpermute +; CHECK: } +)" + : R"( ; CHECK-LABEL: define void @reduce ; CHECK-NOT: call float @llvm.nvvm.shfl.sync.down.f32 ; CHECK: } -)", +)"; + CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); // Check that the kernel runs correctly. diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc index 8b844e66b90..3b19b50eece 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc @@ -38,6 +38,11 @@ class GpuLdgTest : public GpuCodegenTest {}; // Parameters are never overwritten, so parameter reads should get ld.global.nc // reads. +// +// On the ROCM platform the "ptx" string is not populated for the compiled +// executable, and hence the call to CompileAdnVerifyPtx does not do the +// "VerifyPtx" part, it merely compiles the executable +// TEST_F(GpuLdgTest, LdgForParamRead) { HloComputation::Builder builder(TestName()); @@ -60,6 +65,11 @@ TEST_F(GpuLdgTest, LdgForParamRead) { // Check that reading a buffer produced by a non-parameter HLO also results in // ld.global.nc, if that buffer isn't modified within the instruction that reads // it. +// +// On the ROCM platform the "ptx" string is not populated for the compiled +// executable, and hence the call to CompileAdnVerifyPtx does not do the +// "VerifyPtx" part, it merely compiles the executable +// TEST_F(GpuLdgTest, LdgForNonParamRead) { HloComputation::Builder builder(TestName()); @@ -94,6 +104,11 @@ TEST_F(GpuLdgTest, LdgForNonParamRead) { // It seems like a fair bet that we won't start fusing sin into the output of // reduce in the foreseeable future. But if that turns out to be wrong, I give // you, future reader, permission to delete this test. +// +// On the ROCM platform the "ptx" string is not populated for the compiled +// executable, and hence the call to CompileAdnVerifyPtx does not do the +// "VerifyPtx" part, it merely compiles the executable +// TEST_F(GpuLdgTest, NoLdgWhenSharingBuffer) { auto hlo_module = CreateNewVerifiedModule(); HloComputation::Builder builder(TestName()); From 88a1e3b399d7f46cc33ed9a6d14f1873e292bf36 Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Fri, 3 Jan 2020 18:11:27 +0000 Subject: [PATCH 2/4] [ROCm] Fix to enable XLA_GPU device registration for ROCm platform --- tensorflow/compiler/jit/xla_gpu_device.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc index 91943edd775..16f496d51a3 100644 --- a/tensorflow/compiler/jit/xla_gpu_device.cc +++ b/tensorflow/compiler/jit/xla_gpu_device.cc @@ -14,7 +14,7 @@ limitations under the License. ==============================================================================*/ // Registers the XLA_GPU device, which is an XlaDevice instantiation that runs -// operators using XLA via the XLA "CUDA" (GPU) backend. +// operators using XLA via the XLA "CUDA" or "ROCM" (GPU) backend. #include @@ -27,6 +27,7 @@ limitations under the License. #include "tensorflow/compiler/jit/xla_device_ops.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/gpu/gpu_init.h" #include "tensorflow/core/lib/core/status.h" namespace tensorflow { @@ -69,7 +70,8 @@ Status XlaGpuDeviceFactory::ListPhysicalDevices(std::vector* devices) { return Status::OK(); } - auto platform = se::MultiPlatformManager::PlatformWithName("CUDA"); + auto platform = + se::MultiPlatformManager::PlatformWithName(tensorflow::GpuPlatformName()); if (!platform.ok()) { // Treat failures as non-fatal; there might not be a GPU in the machine. VLOG(1) << "Failed to create XLA_GPU device: " << platform.status(); @@ -117,7 +119,8 @@ Status XlaGpuDeviceFactory::CreateDevices( RegisterXlaDeviceKernels(DEVICE_XLA_GPU, DEVICE_GPU_XLA_JIT); (void)registrations; - auto platform = se::MultiPlatformManager::PlatformWithName("CUDA"); + auto platform = + se::MultiPlatformManager::PlatformWithName(tensorflow::GpuPlatformName()); if (!platform.ok()) { // Treat failures as non-fatal; there might not be a GPU in the machine. VLOG(1) << "Failed to create XLA_GPU device: " << platform.status(); From 11b85f74734aa3cc2df422aec8a758d91d2ae1e0 Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Thu, 2 Jan 2020 21:42:54 +0000 Subject: [PATCH 3/4] [ROCm] Adding no_rocm tag to XLA tests that fail on the ROCm platform --- tensorflow/compiler/tests/BUILD | 29 +++++++++++++++---- tensorflow/compiler/tests/build_defs.bzl | 3 +- .../compiler/xla/service/mlir_gpu/tests/BUILD | 2 +- tensorflow/compiler/xla/tests/BUILD | 28 +++++++++++++++--- 4 files changed, 51 insertions(+), 11 deletions(-) diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index 4c3dcd81eb7..3ec240357c9 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -240,7 +240,10 @@ tf_xla_py_test( size = "medium", srcs = ["cholesky_op_test.py"], python_version = "PY3", - tags = ["optonly"], + tags = [ + "no_rocm", + "optonly", + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -297,7 +300,10 @@ tf_xla_py_test( "cpu_ondemand", ], python_version = "PY3", - tags = ["optonly"], + tags = [ + "no_rocm", + "optonly", + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -382,7 +388,10 @@ tf_xla_py_test( size = "medium", srcs = ["concat_ops_test.py"], python_version = "PY3", - tags = ["many_xla_args"], + tags = [ + "many_xla_args", + "no_rocm", + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -568,7 +577,10 @@ tf_xla_py_test( srcs = ["fft_test.py"], python_version = "PY3", shard_count = 6, - tags = ["optonly"], + tags = [ + "no_rocm", + "optonly", + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -845,7 +857,10 @@ tf_xla_py_test( srcs = ["unstack_test.py"], python_version = "PY3", shard_count = 5, - tags = ["optonly"], + tags = [ + "no_rocm", + "optonly", + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -1292,6 +1307,7 @@ cuda_py_test( size = "medium", srcs = ["jit_test.py"], shard_count = 5, + tags = ["no_rocm"], xla_enable_strict_auto_jit = False, deps = [ ":test_utils", @@ -1312,6 +1328,7 @@ cuda_py_test( name = "dense_layer_test", size = "medium", srcs = ["dense_layer_test.py"], + tags = ["no_rocm"], xla_enable_strict_auto_jit = False, deps = [ ":test_utils", @@ -1396,6 +1413,7 @@ py_library( cuda_py_test( name = "lstm_test", srcs = ["lstm_test.py"], + tags = ["no_rocm"], xla_enable_strict_auto_jit = False, deps = [ ":lstm", @@ -1498,6 +1516,7 @@ tf_xla_py_test( srcs = ["conv_node_name_test.py"], python_version = "PY3", shard_count = 5, + tags = ["no_rocm"], deps = [ ":xla_test", "//tensorflow/python:array_ops", diff --git a/tensorflow/compiler/tests/build_defs.bzl b/tensorflow/compiler/tests/build_defs.bzl index 04cb2a0b975..277efd1f013 100644 --- a/tensorflow/compiler/tests/build_defs.bzl +++ b/tensorflow/compiler/tests/build_defs.bzl @@ -1,6 +1,7 @@ """Build rules for Tensorflow/XLA testing.""" load("@local_config_cuda//cuda:build_defs.bzl", "cuda_is_configured") +load("@local_config_rocm//rocm:build_defs.bzl", "rocm_is_configured") load("//tensorflow/compiler/tests:plugin.bzl", "plugins") load( "//tensorflow/core/platform:build_config_root.bzl", @@ -10,7 +11,7 @@ load( def all_backends(): b = ["cpu"] + plugins.keys() - if cuda_is_configured(): + if cuda_is_configured() or rocm_is_configured(): return b + ["gpu"] else: return b diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD b/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD index fded1859e33..16077260607 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD @@ -21,7 +21,7 @@ package_group( tf_cc_test( name = "mlir_gpu_lhlo_gen_test", srcs = ["mlir_gpu_lhlo_gen_test.cc"], - tags = tf_cuda_tests_tags(), + tags = tf_cuda_tests_tags() + ["no_rocm"], deps = [ "//tensorflow/compiler/xla/service:mlir_gpu_plugin", "//tensorflow/compiler/xla/service/mlir_gpu:mlir_irgen_test_base", diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index b2cc8050c42..35aad4cab47 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -587,6 +587,7 @@ xla_test( name = "conditional_test", srcs = ["conditional_test.cc"], shard_count = 2, + tags = ["no_rocm"], deps = [ ":test_macros_header", "//tensorflow/compiler/xla:xla_data_proto_cc", @@ -625,6 +626,7 @@ xla_test( name = "scalar_computations_test", srcs = ["scalar_computations_test.cc"], shard_count = 32, + tags = ["no_rocm"], deps = [ ":test_macros_header", "//tensorflow/compiler/xla:literal", @@ -924,6 +926,7 @@ xla_test( srcs = ["dot_operation_test.cc"], shard_count = 20, tags = [ + "no_rocm", "optonly", ], deps = [ @@ -957,6 +960,7 @@ xla_test( backends = ["gpu"], shard_count = 20, tags = [ + "no_rocm", "optonly", ], deps = [ @@ -1019,7 +1023,10 @@ xla_test( ], }, shard_count = 20, - tags = ["optonly"], + tags = [ + "no_rocm", + "optonly", + ], deps = [ ":test_macros_header", "//tensorflow/compiler/xla:array2d", @@ -1113,7 +1120,10 @@ xla_test( timeout = "long", srcs = ["convolution_test.cc"], shard_count = 40, - tags = ["optonly"], + tags = [ + "no_rocm", + "optonly", + ], deps = CONVOLUTION_TEST_DEPS + [ "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", @@ -1130,7 +1140,10 @@ xla_test( args = ["--xla_gpu_disable_autotune"], backends = ["gpu"], shard_count = 40, - tags = ["optonly"], + tags = [ + "no_rocm", + "optonly", + ], deps = CONVOLUTION_TEST_DEPS + [ "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", @@ -1144,6 +1157,7 @@ xla_test( backend_args = {"gpu": ["--xla_backend_extra_options=xla_gpu_experimental_conv_disable_layout_heuristic"]}, backends = ["gpu"], shard_count = 25, + tags = ["no_rocm"], deps = CONVOLUTION_TEST_DEPS + [ "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", @@ -1213,6 +1227,7 @@ xla_test( "interpreter", ], shard_count = 40, + tags = ["no_rocm"], deps = [ ":client_library_test_base", ":hlo_test_base", @@ -1418,6 +1433,7 @@ xla_test( srcs = ["reduce_test.cc"], shard_count = 31, tags = [ + "no_rocm", "optonly", ], deps = [ @@ -1497,6 +1513,7 @@ xla_test( timeout = "long", srcs = ["select_and_scatter_test.cc"], tags = [ + "no_rocm", "optonly", ], deps = [ @@ -2543,7 +2560,10 @@ xla_test( xla_test( name = "cholesky_test", srcs = ["cholesky_test.cc"], - tags = ["optonly"], + tags = [ + "no_rocm", + "optonly", + ], deps = [ ":test_macros_header", "//tensorflow/compiler/xla:array2d", From 3e4a3d5c83be06c05a0522edd5c5690dda5efa3b Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Thu, 16 Jan 2020 02:48:17 +0000 Subject: [PATCH 4/4] changes to address code review feedback --- .../xla/service/gpu/tests/gpu_codegen_test.cc | 12 ++++---- .../xla/service/gpu/tests/gpu_codegen_test.h | 7 +++-- .../xla/service/gpu/tests/gpu_ftz_test.cc | 28 +++---------------- .../xla/service/gpu/tests/gpu_index_test.cc | 20 +++++-------- .../xla/service/gpu/tests/gpu_ldg_test.cc | 6 ++-- 5 files changed, 25 insertions(+), 48 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc index ce62fe205ab..e9af2336922 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc @@ -46,7 +46,7 @@ GpuCodegenTest::CreateNewVerifiedModuleWithFTZ(bool ftz) { ShapeUtil::ByteSizeOfElements); } -void GpuCodegenTest::CompileAndVerifyPtx( +void GpuCodegenTest::CompileAndOptionallyVerifyPtx( std::unique_ptr hlo_module, absl::string_view pattern) { std::unique_ptr executable = std::move(CompileToExecutable(std::move(hlo_module)).ValueOrDie()); @@ -55,11 +55,11 @@ void GpuCodegenTest::CompileAndVerifyPtx( // On the ROCM platform the "ptx" string is not populated for the compiled // executable, and hence the "ptx_str" will be empty. So disabling the // pattern check on the ROCm platform -#if !defined(TENSORFLOW_USE_ROCM) - StatusOr filecheck_result = RunFileCheck(ptx_str, pattern); - ASSERT_TRUE(filecheck_result.ok()); - EXPECT_TRUE(filecheck_result.ValueOrDie()); -#endif + if (!is_built_with_rocm_) { + StatusOr filecheck_result = RunFileCheck(ptx_str, pattern); + ASSERT_TRUE(filecheck_result.ok()); + EXPECT_TRUE(filecheck_result.ValueOrDie()); + } } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h index 5f5b21150c1..c187e90301d 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h @@ -39,8 +39,11 @@ class GpuCodegenTest : public LlvmIrGenTestBase { // Compiles the given HLO module to PTX and verifies the PTX matches the given // FileCheck pattern. (See http://llvm.org/docs/CommandGuide/FileCheck.html). - void CompileAndVerifyPtx(std::unique_ptr hlo_module, - absl::string_view pattern); + // The "VerifyPtx" part only happens on the CUDA platform, + // and hence the "Optionally" in function name. + // For ROCm platform this routine will only do the "Compile" part. + void CompileAndOptionallyVerifyPtx( + std::unique_ptr hlo_module, absl::string_view pattern); bool is_built_with_rocm_; }; diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc index 1e95119d7ae..282f7b24a31 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc @@ -76,25 +76,15 @@ class GpuFtzDisabledTest : public GpuFtzTest { }; // Check that we emit mul.ftz.f32 when in ftz mode, and plain mul.f32 otherwise. -// -// On the ROCM platform the "ptx" string is not populated for the compiled -// executable, and hence the call to CompileAdnVerifyPtx does not do the -// "VerifyPtx" part, it merely compiles the executable -// TEST_F(GpuFtzEnabledTest, MultiplyFtz) { - CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"( + CompileAndOptionallyVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"( CHECK-NOT: mul.rn.f32 CHECK: mul.rn.ftz.f32 CHECK-NOT: mul.rn.f32 )"); } -// -// On the ROCM platform the "ptx" string is not populated for the compiled -// executable, and hence the call to CompileAdnVerifyPtx does not do the -// "VerifyPtx" part, it merely compiles the executable -// TEST_F(GpuFtzDisabledTest, MultiplyFtz) { - CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"( + CompileAndOptionallyVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"( CHECK-NOT: mul.rn.ftz.f32 CHECK: mul.rn.f32 CHECK-NOT: mul.rn.ftz.f32 @@ -106,13 +96,8 @@ TEST_F(GpuFtzDisabledTest, MultiplyFtz) { // calls to ex2.approx. When ftz is on, we get two calls to the ftz version; // when ftz is off, we get one call to the ftz version and one call to the // regular version. -// -// On the ROCM platform the "ptx" string is not populated for the compiled -// executable, and hence the call to CompileAdnVerifyPtx does not do the -// "VerifyPtx" part, it merely compiles the executable -// TEST_F(GpuFtzEnabledTest, ExpFtz) { - CompileAndVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"( + CompileAndOptionallyVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"( CHECK-NOT: ex2.approx.f32 CHECK: ex2.approx.ftz.f32 CHECK-NOT: ex2.approx.f32 @@ -122,13 +107,8 @@ TEST_F(GpuFtzEnabledTest, ExpFtz) { )"); } -// -// On the ROCM platform the "ptx" string is not populated for the compiled -// executable, and hence the call to CompileAdnVerifyPtx does not do the -// "VerifyPtx" part, it merely compiles the executable -// TEST_F(GpuFtzDisabledTest, ExpFtz) { - CompileAndVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"( + CompileAndOptionallyVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"( CHECK-NOT: ex2.approx.f32 CHECK-DAG: ex2.approx.ftz.f32 CHECK-DAG: ex2.approx.f32 diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc index 3dd250c1d1d..67b291c8fcb 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc @@ -108,21 +108,15 @@ TEST_F(GpuIndexTest, CompatibleUseLinearIndexWithReshapeAndBroadcast) { // In the IR generated for AMDGPUs, we do not seem to have the // the addrspace(1) attribute for the lines being checked by the following - // patterns still need to investigate why that is the case, and whether or not - // it is ok - auto expected_ir = is_built_with_rocm_ ? R"( + // patterns. + // need to investigate why that is the case, and whether or not it is ok + CompileAndVerifyIr(std::move(module), + R"( ; CHECK: %[[urem1:.*]] = urem i{{[0-9]*}} %[[linear_index:.*]], 14 -; CHECK: %[[bitcast:.*]] = bitcast i8* %[[alloc:.*]] to float* +; CHECK: %[[bitcast:.*]] = bitcast i8{{( addrspace\(1\))?}}* %[[alloc:.*]] to float{{( addrspace\(1\))?}}* ; CHECK: %[[idx1:.*]] = zext i{{[0-9]*}} %[[urem1]] to i64 -; CHECK: getelementptr inbounds float, float* %[[bitcast]], i64 %[[idx1]] - )" - : R"( -; CHECK: %[[urem1:.*]] = urem i{{[0-9]*}} %[[linear_index:.*]], 14 -; CHECK: %[[bitcast:.*]] = bitcast i8 addrspace(1)* %[[alloc:.*]] to float addrspace(1)* -; CHECK: %[[idx1:.*]] = zext i{{[0-9]*}} %[[urem1]] to i64 -; CHECK: getelementptr inbounds float, float addrspace(1)* %[[bitcast]], i64 %[[idx1]] - )"; - CompileAndVerifyIr(std::move(module), expected_ir, +; CHECK: getelementptr inbounds float, float{{( addrspace\(1\))?}}* %[[bitcast]], i64 %[[idx1]] + )", /*match_optimized_ir=*/true); } diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc index 3b19b50eece..aca3cca7b11 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc @@ -56,7 +56,7 @@ TEST_F(GpuLdgTest, LdgForParamRead) { auto hlo_module = CreateNewVerifiedModule(); hlo_module->AddEntryComputation(std::move(computation)); - CompileAndVerifyPtx(std::move(hlo_module), R"( + CompileAndOptionallyVerifyPtx(std::move(hlo_module), R"( CHECK-NOT: ld.global.f32 CHECK: ld.global.nc.f32 )"); @@ -86,7 +86,7 @@ TEST_F(GpuLdgTest, LdgForNonParamRead) { auto hlo_module = CreateNewVerifiedModule(); hlo_module->AddEntryComputation(std::move(computation)); - CompileAndVerifyPtx(std::move(hlo_module), R"( + CompileAndOptionallyVerifyPtx(std::move(hlo_module), R"( CHECK: { CHECK-NOT: ld.global.f32 CHECK: ld.global.nc.f32 @@ -143,7 +143,7 @@ TEST_F(GpuLdgTest, NoLdgWhenSharingBuffer) { std::unique_ptr computation = builder.Build(); hlo_module->AddEntryComputation(std::move(computation)); - CompileAndVerifyPtx(std::move(hlo_module), R"( + CompileAndOptionallyVerifyPtx(std::move(hlo_module), R"( CHECK-LABEL: .entry sin CHECK: { CHECK-NOT: ld.global.nc.f32