diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc index ce62fe205ab..e9af2336922 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc @@ -46,7 +46,7 @@ GpuCodegenTest::CreateNewVerifiedModuleWithFTZ(bool ftz) { ShapeUtil::ByteSizeOfElements); } -void GpuCodegenTest::CompileAndVerifyPtx( +void GpuCodegenTest::CompileAndOptionallyVerifyPtx( std::unique_ptr hlo_module, absl::string_view pattern) { std::unique_ptr executable = std::move(CompileToExecutable(std::move(hlo_module)).ValueOrDie()); @@ -55,11 +55,11 @@ void GpuCodegenTest::CompileAndVerifyPtx( // On the ROCM platform the "ptx" string is not populated for the compiled // executable, and hence the "ptx_str" will be empty. So disabling the // pattern check on the ROCm platform -#if !defined(TENSORFLOW_USE_ROCM) - StatusOr filecheck_result = RunFileCheck(ptx_str, pattern); - ASSERT_TRUE(filecheck_result.ok()); - EXPECT_TRUE(filecheck_result.ValueOrDie()); -#endif + if (!is_built_with_rocm_) { + StatusOr filecheck_result = RunFileCheck(ptx_str, pattern); + ASSERT_TRUE(filecheck_result.ok()); + EXPECT_TRUE(filecheck_result.ValueOrDie()); + } } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h index 5f5b21150c1..c187e90301d 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h @@ -39,8 +39,11 @@ class GpuCodegenTest : public LlvmIrGenTestBase { // Compiles the given HLO module to PTX and verifies the PTX matches the given // FileCheck pattern. (See http://llvm.org/docs/CommandGuide/FileCheck.html). - void CompileAndVerifyPtx(std::unique_ptr hlo_module, - absl::string_view pattern); + // The "VerifyPtx" part only happens on the CUDA platform, + // and hence the "Optionally" in function name. + // For ROCm platform this routine will only do the "Compile" part. + void CompileAndOptionallyVerifyPtx( + std::unique_ptr hlo_module, absl::string_view pattern); bool is_built_with_rocm_; }; diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc index 1e95119d7ae..282f7b24a31 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc @@ -76,25 +76,15 @@ class GpuFtzDisabledTest : public GpuFtzTest { }; // Check that we emit mul.ftz.f32 when in ftz mode, and plain mul.f32 otherwise. -// -// On the ROCM platform the "ptx" string is not populated for the compiled -// executable, and hence the call to CompileAdnVerifyPtx does not do the -// "VerifyPtx" part, it merely compiles the executable -// TEST_F(GpuFtzEnabledTest, MultiplyFtz) { - CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"( + CompileAndOptionallyVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"( CHECK-NOT: mul.rn.f32 CHECK: mul.rn.ftz.f32 CHECK-NOT: mul.rn.f32 )"); } -// -// On the ROCM platform the "ptx" string is not populated for the compiled -// executable, and hence the call to CompileAdnVerifyPtx does not do the -// "VerifyPtx" part, it merely compiles the executable -// TEST_F(GpuFtzDisabledTest, MultiplyFtz) { - CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"( + CompileAndOptionallyVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"( CHECK-NOT: mul.rn.ftz.f32 CHECK: mul.rn.f32 CHECK-NOT: mul.rn.ftz.f32 @@ -106,13 +96,8 @@ TEST_F(GpuFtzDisabledTest, MultiplyFtz) { // calls to ex2.approx. When ftz is on, we get two calls to the ftz version; // when ftz is off, we get one call to the ftz version and one call to the // regular version. -// -// On the ROCM platform the "ptx" string is not populated for the compiled -// executable, and hence the call to CompileAdnVerifyPtx does not do the -// "VerifyPtx" part, it merely compiles the executable -// TEST_F(GpuFtzEnabledTest, ExpFtz) { - CompileAndVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"( + CompileAndOptionallyVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"( CHECK-NOT: ex2.approx.f32 CHECK: ex2.approx.ftz.f32 CHECK-NOT: ex2.approx.f32 @@ -122,13 +107,8 @@ TEST_F(GpuFtzEnabledTest, ExpFtz) { )"); } -// -// On the ROCM platform the "ptx" string is not populated for the compiled -// executable, and hence the call to CompileAdnVerifyPtx does not do the -// "VerifyPtx" part, it merely compiles the executable -// TEST_F(GpuFtzDisabledTest, ExpFtz) { - CompileAndVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"( + CompileAndOptionallyVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"( CHECK-NOT: ex2.approx.f32 CHECK-DAG: ex2.approx.ftz.f32 CHECK-DAG: ex2.approx.f32 diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc index 3dd250c1d1d..67b291c8fcb 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc @@ -108,21 +108,15 @@ TEST_F(GpuIndexTest, CompatibleUseLinearIndexWithReshapeAndBroadcast) { // In the IR generated for AMDGPUs, we do not seem to have the // the addrspace(1) attribute for the lines being checked by the following - // patterns still need to investigate why that is the case, and whether or not - // it is ok - auto expected_ir = is_built_with_rocm_ ? R"( + // patterns. + // need to investigate why that is the case, and whether or not it is ok + CompileAndVerifyIr(std::move(module), + R"( ; CHECK: %[[urem1:.*]] = urem i{{[0-9]*}} %[[linear_index:.*]], 14 -; CHECK: %[[bitcast:.*]] = bitcast i8* %[[alloc:.*]] to float* +; CHECK: %[[bitcast:.*]] = bitcast i8{{( addrspace\(1\))?}}* %[[alloc:.*]] to float{{( addrspace\(1\))?}}* ; CHECK: %[[idx1:.*]] = zext i{{[0-9]*}} %[[urem1]] to i64 -; CHECK: getelementptr inbounds float, float* %[[bitcast]], i64 %[[idx1]] - )" - : R"( -; CHECK: %[[urem1:.*]] = urem i{{[0-9]*}} %[[linear_index:.*]], 14 -; CHECK: %[[bitcast:.*]] = bitcast i8 addrspace(1)* %[[alloc:.*]] to float addrspace(1)* -; CHECK: %[[idx1:.*]] = zext i{{[0-9]*}} %[[urem1]] to i64 -; CHECK: getelementptr inbounds float, float addrspace(1)* %[[bitcast]], i64 %[[idx1]] - )"; - CompileAndVerifyIr(std::move(module), expected_ir, +; CHECK: getelementptr inbounds float, float{{( addrspace\(1\))?}}* %[[bitcast]], i64 %[[idx1]] + )", /*match_optimized_ir=*/true); } diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc index 3b19b50eece..aca3cca7b11 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc @@ -56,7 +56,7 @@ TEST_F(GpuLdgTest, LdgForParamRead) { auto hlo_module = CreateNewVerifiedModule(); hlo_module->AddEntryComputation(std::move(computation)); - CompileAndVerifyPtx(std::move(hlo_module), R"( + CompileAndOptionallyVerifyPtx(std::move(hlo_module), R"( CHECK-NOT: ld.global.f32 CHECK: ld.global.nc.f32 )"); @@ -86,7 +86,7 @@ TEST_F(GpuLdgTest, LdgForNonParamRead) { auto hlo_module = CreateNewVerifiedModule(); hlo_module->AddEntryComputation(std::move(computation)); - CompileAndVerifyPtx(std::move(hlo_module), R"( + CompileAndOptionallyVerifyPtx(std::move(hlo_module), R"( CHECK: { CHECK-NOT: ld.global.f32 CHECK: ld.global.nc.f32 @@ -143,7 +143,7 @@ TEST_F(GpuLdgTest, NoLdgWhenSharingBuffer) { std::unique_ptr computation = builder.Build(); hlo_module->AddEntryComputation(std::move(computation)); - CompileAndVerifyPtx(std::move(hlo_module), R"( + CompileAndOptionallyVerifyPtx(std::move(hlo_module), R"( CHECK-LABEL: .entry sin CHECK: { CHECK-NOT: ld.global.nc.f32