[ROCm] adding ROCm specific versions of "expected" results for FileCheck
This commit is contained in:
parent
90afef10d5
commit
c14b6951de
@ -51,9 +51,15 @@ void GpuCodegenTest::CompileAndVerifyPtx(
|
||||
std::unique_ptr<Executable> executable =
|
||||
std::move(CompileToExecutable(std::move(hlo_module)).ValueOrDie());
|
||||
string ptx_str(static_cast<GpuExecutable*>(executable.get())->text());
|
||||
|
||||
// On the ROCM platform the "ptx" string is not populated for the compiled
|
||||
// executable, and hence the "ptx_str" will be empty. So disabling the
|
||||
// pattern check on the ROCm platform
|
||||
#if !defined(TENSORFLOW_USE_ROCM)
|
||||
StatusOr<bool> filecheck_result = RunFileCheck(ptx_str, pattern);
|
||||
ASSERT_TRUE(filecheck_result.ok());
|
||||
EXPECT_TRUE(filecheck_result.ValueOrDie());
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace gpu
|
||||
|
@ -27,6 +27,11 @@ namespace gpu {
|
||||
|
||||
// Tests that verify IR or PTX emitted by the GPU backend is as expected.
|
||||
class GpuCodegenTest : public LlvmIrGenTestBase {
|
||||
public:
|
||||
GpuCodegenTest()
|
||||
: is_built_with_rocm_(
|
||||
se::MultiPlatformManager::PlatformWithName("ROCM").ok()) {}
|
||||
|
||||
protected:
|
||||
// Like HloTestBase::CreateNewVerifiedModule(), with a flag for configuring
|
||||
// the ftz option.
|
||||
@ -36,6 +41,8 @@ class GpuCodegenTest : public LlvmIrGenTestBase {
|
||||
// FileCheck pattern. (See http://llvm.org/docs/CommandGuide/FileCheck.html).
|
||||
void CompileAndVerifyPtx(std::unique_ptr<VerifiedHloModule> hlo_module,
|
||||
absl::string_view pattern);
|
||||
|
||||
bool is_built_with_rocm_;
|
||||
};
|
||||
|
||||
} // namespace gpu
|
||||
|
@ -76,6 +76,11 @@ class GpuFtzDisabledTest : public GpuFtzTest {
|
||||
};
|
||||
|
||||
// Check that we emit mul.ftz.f32 when in ftz mode, and plain mul.f32 otherwise.
|
||||
//
|
||||
// On the ROCM platform the "ptx" string is not populated for the compiled
|
||||
// executable, and hence the call to CompileAdnVerifyPtx does not do the
|
||||
// "VerifyPtx" part, it merely compiles the executable
|
||||
//
|
||||
TEST_F(GpuFtzEnabledTest, MultiplyFtz) {
|
||||
CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"(
|
||||
CHECK-NOT: mul.rn.f32
|
||||
@ -83,6 +88,11 @@ TEST_F(GpuFtzEnabledTest, MultiplyFtz) {
|
||||
CHECK-NOT: mul.rn.f32
|
||||
)");
|
||||
}
|
||||
//
|
||||
// On the ROCM platform the "ptx" string is not populated for the compiled
|
||||
// executable, and hence the call to CompileAdnVerifyPtx does not do the
|
||||
// "VerifyPtx" part, it merely compiles the executable
|
||||
//
|
||||
TEST_F(GpuFtzDisabledTest, MultiplyFtz) {
|
||||
CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"(
|
||||
CHECK-NOT: mul.rn.ftz.f32
|
||||
@ -96,6 +106,11 @@ TEST_F(GpuFtzDisabledTest, MultiplyFtz) {
|
||||
// calls to ex2.approx. When ftz is on, we get two calls to the ftz version;
|
||||
// when ftz is off, we get one call to the ftz version and one call to the
|
||||
// regular version.
|
||||
//
|
||||
// On the ROCM platform the "ptx" string is not populated for the compiled
|
||||
// executable, and hence the call to CompileAdnVerifyPtx does not do the
|
||||
// "VerifyPtx" part, it merely compiles the executable
|
||||
//
|
||||
TEST_F(GpuFtzEnabledTest, ExpFtz) {
|
||||
CompileAndVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"(
|
||||
CHECK-NOT: ex2.approx.f32
|
||||
@ -107,6 +122,11 @@ TEST_F(GpuFtzEnabledTest, ExpFtz) {
|
||||
)");
|
||||
}
|
||||
|
||||
//
|
||||
// On the ROCM platform the "ptx" string is not populated for the compiled
|
||||
// executable, and hence the call to CompileAdnVerifyPtx does not do the
|
||||
// "VerifyPtx" part, it merely compiles the executable
|
||||
//
|
||||
TEST_F(GpuFtzDisabledTest, ExpFtz) {
|
||||
CompileAndVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"(
|
||||
CHECK-NOT: ex2.approx.f32
|
||||
|
@ -105,13 +105,24 @@ TEST_F(GpuIndexTest, CompatibleUseLinearIndexWithReshapeAndBroadcast) {
|
||||
.ValueOrDie();
|
||||
|
||||
// Check the optimized IR reuses the linear index by calculating modulo 14.
|
||||
CompileAndVerifyIr(std::move(module),
|
||||
R"(
|
||||
|
||||
// In the IR generated for AMDGPUs, we do not seem to have the
|
||||
// the addrspace(1) attribute for the lines being checked by the following
|
||||
// patterns still need to investigate why that is the case, and whether or not
|
||||
// it is ok
|
||||
auto expected_ir = is_built_with_rocm_ ? R"(
|
||||
; CHECK: %[[urem1:.*]] = urem i{{[0-9]*}} %[[linear_index:.*]], 14
|
||||
; CHECK: %[[bitcast:.*]] = bitcast i8* %[[alloc:.*]] to float*
|
||||
; CHECK: %[[idx1:.*]] = zext i{{[0-9]*}} %[[urem1]] to i64
|
||||
; CHECK: getelementptr inbounds float, float* %[[bitcast]], i64 %[[idx1]]
|
||||
)"
|
||||
: R"(
|
||||
; CHECK: %[[urem1:.*]] = urem i{{[0-9]*}} %[[linear_index:.*]], 14
|
||||
; CHECK: %[[bitcast:.*]] = bitcast i8 addrspace(1)* %[[alloc:.*]] to float addrspace(1)*
|
||||
; CHECK: %[[idx1:.*]] = zext i{{[0-9]*}} %[[urem1]] to i64
|
||||
; CHECK: getelementptr inbounds float, float addrspace(1)* %[[bitcast]], i64 %[[idx1]]
|
||||
)",
|
||||
)";
|
||||
CompileAndVerifyIr(std::move(module), expected_ir,
|
||||
/*match_optimized_ir=*/true);
|
||||
}
|
||||
|
||||
|
@ -63,12 +63,17 @@ TEST_F(GpuSliceInputFusionTest, InputFusionWithOnlyOneSlice) {
|
||||
auto hlo_module =
|
||||
ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
|
||||
.ValueOrDie();
|
||||
CompileAndVerifyIr(std::move(hlo_module),
|
||||
R"(
|
||||
auto expected_ir = is_built_with_rocm_ ? R"(
|
||||
; CHECK-LABEL: define amdgpu_kernel void @fusion
|
||||
; CHECK: slice0
|
||||
; CHECK: }
|
||||
)"
|
||||
: R"(
|
||||
; CHECK-LABEL: define void @fusion
|
||||
; CHECK: slice0
|
||||
; CHECK: }
|
||||
)",
|
||||
)";
|
||||
CompileAndVerifyIr(std::move(hlo_module), expected_ir,
|
||||
/*match_optimized_ir=*/false);
|
||||
// Check that the kernel runs correctly.
|
||||
EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0, 0}));
|
||||
@ -100,12 +105,17 @@ TEST_F(GpuSliceInputFusionTest, InputFusionWithATupleOfSlices) {
|
||||
auto hlo_module =
|
||||
ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
|
||||
.ValueOrDie();
|
||||
CompileAndVerifyIr(std::move(hlo_module),
|
||||
R"(
|
||||
auto expected_ir = is_built_with_rocm_ ? R"(
|
||||
; CHECK-LABEL: define amdgpu_kernel void @fusion
|
||||
; CHECK: slice2
|
||||
; CHECK: }
|
||||
)"
|
||||
: R"(
|
||||
; CHECK-LABEL: define void @fusion
|
||||
; CHECK: slice2
|
||||
; CHECK: }
|
||||
)",
|
||||
)";
|
||||
CompileAndVerifyIr(std::move(hlo_module), expected_ir,
|
||||
/*match_optimized_ir=*/false);
|
||||
// Check that the kernel runs correctly.
|
||||
EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0, 0}));
|
||||
@ -142,12 +152,17 @@ TEST_F(GpuSliceInputFusionTest, ConcatThenSplit) {
|
||||
auto hlo_module =
|
||||
ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
|
||||
.ValueOrDie();
|
||||
CompileAndVerifyIr(std::move(hlo_module),
|
||||
R"(
|
||||
auto expected_ir = is_built_with_rocm_ ? R"(
|
||||
; CHECK-LABEL: define amdgpu_kernel void @fusion
|
||||
; CHECK: slice2
|
||||
; CHECK: }
|
||||
)"
|
||||
: R"(
|
||||
; CHECK-LABEL: define void @fusion
|
||||
; CHECK: slice2
|
||||
; CHECK: }
|
||||
)",
|
||||
)";
|
||||
CompileAndVerifyIr(std::move(hlo_module), expected_ir,
|
||||
/*match_optimized_ir=*/false);
|
||||
// Check that the kernel runs correctly.
|
||||
EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0, 0}));
|
||||
|
@ -63,12 +63,19 @@ TEST_F(GpuKernelTilingTest, UnnestedTransposeWithProperDimensionsTiled) {
|
||||
auto hlo_module =
|
||||
ParseAndReturnVerifiedModule(kHloString, ConfigWithLayoutAssignment())
|
||||
.ValueOrDie();
|
||||
CompileAndVerifyIr(std::move(hlo_module),
|
||||
R"(
|
||||
|
||||
auto expected_ir = is_built_with_rocm_ ? R"(
|
||||
; CHECK-LABEL: define amdgpu_kernel void @copy
|
||||
; CHECK: call void @llvm.amdgcn.s.barrier()
|
||||
; CHECK: }
|
||||
)"
|
||||
: R"(
|
||||
; CHECK-LABEL: define void @copy
|
||||
; CHECK: call void @llvm.nvvm.barrier0()
|
||||
; CHECK: }
|
||||
)",
|
||||
)";
|
||||
|
||||
CompileAndVerifyIr(std::move(hlo_module), expected_ir,
|
||||
/*match_optimized_ir=*/true);
|
||||
|
||||
// Check that the kernel runs correctly.
|
||||
@ -90,12 +97,17 @@ TEST_F(GpuKernelTilingTest, UnnestedTransposeWithSmallDimensionsNotTiled) {
|
||||
auto hlo_module =
|
||||
ParseAndReturnVerifiedModule(kHloString, ConfigWithLayoutAssignment())
|
||||
.ValueOrDie();
|
||||
CompileAndVerifyIr(std::move(hlo_module),
|
||||
R"(
|
||||
auto expected_ir = is_built_with_rocm_ ? R"(
|
||||
; CHECK-LABEL: define amdgpu_kernel void @copy
|
||||
; CHECK-NOT: call void @llvm.amdgcn.s.barrier()
|
||||
; CHECK: }
|
||||
)"
|
||||
: R"(
|
||||
; CHECK-LABEL: define void @copy
|
||||
; CHECK-NOT: call void @llvm.nvvm.barrier0()
|
||||
; CHECK: }
|
||||
)",
|
||||
)";
|
||||
CompileAndVerifyIr(std::move(hlo_module), expected_ir,
|
||||
/*match_optimized_ir=*/true);
|
||||
}
|
||||
|
||||
@ -134,12 +146,17 @@ TEST_F(GpuKernelTilingTest, SimpleFusionWithTransposeTiled) {
|
||||
auto hlo_module =
|
||||
ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
|
||||
.ValueOrDie();
|
||||
CompileAndVerifyIr(std::move(hlo_module),
|
||||
R"(
|
||||
auto expected_ir = is_built_with_rocm_ ? R"(
|
||||
; CHECK-LABEL: define amdgpu_kernel void @fusion
|
||||
; CHECK: call void @llvm.amdgcn.s.barrier()
|
||||
; CHECK: }
|
||||
)"
|
||||
: R"(
|
||||
; CHECK-LABEL: define void @fusion
|
||||
; CHECK: call void @llvm.nvvm.barrier0()
|
||||
; CHECK: }
|
||||
)",
|
||||
)";
|
||||
CompileAndVerifyIr(std::move(hlo_module), expected_ir,
|
||||
/*match_optimized_ir=*/true);
|
||||
|
||||
// Check that the kernel runs correctly.
|
||||
@ -169,12 +186,17 @@ TEST_F(GpuKernelTilingTest, MultipleOutputFusionWithOnePossibleTransposeTiled) {
|
||||
auto hlo_module =
|
||||
ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
|
||||
.ValueOrDie();
|
||||
CompileAndVerifyIr(std::move(hlo_module),
|
||||
R"(
|
||||
auto expected_ir = is_built_with_rocm_ ? R"(
|
||||
; CHECK-LABEL: define amdgpu_kernel void @fusion
|
||||
; CHECK: call void @llvm.amdgcn.s.barrier()
|
||||
; CHECK: }
|
||||
)"
|
||||
: R"(
|
||||
; CHECK-LABEL: define void @fusion
|
||||
; CHECK: call void @llvm.nvvm.barrier0()
|
||||
; CHECK: }
|
||||
)",
|
||||
)";
|
||||
CompileAndVerifyIr(std::move(hlo_module), expected_ir,
|
||||
/*match_optimized_ir=*/true);
|
||||
|
||||
// Check that the kernel runs correctly.
|
||||
@ -205,12 +227,17 @@ TEST_F(GpuKernelTilingTest,
|
||||
auto hlo_module =
|
||||
ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
|
||||
.ValueOrDie();
|
||||
CompileAndVerifyIr(std::move(hlo_module),
|
||||
R"(
|
||||
auto expected_ir = is_built_with_rocm_ ? R"(
|
||||
; CHECK-LABEL: define amdgpu_kernel void @fusion
|
||||
; CHECK-NOT: call void @llvm.amdgcn.s.barrier()
|
||||
; CHECK: }
|
||||
)"
|
||||
: R"(
|
||||
; CHECK-LABEL: define void @fusion
|
||||
; CHECK-NOT: call void @llvm.nvvm.barrier0()
|
||||
; CHECK: }
|
||||
)",
|
||||
)";
|
||||
CompileAndVerifyIr(std::move(hlo_module), expected_ir,
|
||||
/*match_optimized_ir=*/true);
|
||||
}
|
||||
|
||||
@ -233,12 +260,17 @@ TEST_F(GpuKernelTilingTest, TransposedInputWithUserReverseNotTiled) {
|
||||
auto hlo_module =
|
||||
ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
|
||||
.ValueOrDie();
|
||||
CompileAndVerifyIr(std::move(hlo_module),
|
||||
R"(
|
||||
auto expected_ir = is_built_with_rocm_ ? R"(
|
||||
; CHECK-LABEL: define amdgpu_kernel void @fusion
|
||||
; CHECK-NOT: call void @llvm.amdgcn.s.barrier()
|
||||
; CHECK: }
|
||||
)"
|
||||
: R"(
|
||||
; CHECK-LABEL: define void @fusion
|
||||
; CHECK-NOT: call void @llvm.nvvm.barrier0()
|
||||
; CHECK: }
|
||||
)",
|
||||
)";
|
||||
CompileAndVerifyIr(std::move(hlo_module), expected_ir,
|
||||
/*match_optimized_ir=*/true);
|
||||
}
|
||||
|
||||
@ -261,12 +293,17 @@ TEST_F(GpuKernelTilingTest, TransposedInputWithUserBitcastNotTiled) {
|
||||
auto hlo_module =
|
||||
ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
|
||||
.ValueOrDie();
|
||||
CompileAndVerifyIr(std::move(hlo_module),
|
||||
R"(
|
||||
auto expected_ir = is_built_with_rocm_ ? R"(
|
||||
; CHECK-LABEL: define amdgpu_kernel void @fusion
|
||||
; CHECK-NOT: call void @llvm.amdgcn.s.barrier()
|
||||
; CHECK: }
|
||||
)"
|
||||
: R"(
|
||||
; CHECK-LABEL: define void @fusion
|
||||
; CHECK-NOT: call void @llvm.nvvm.barrier0()
|
||||
; CHECK: }
|
||||
)",
|
||||
)";
|
||||
CompileAndVerifyIr(std::move(hlo_module), expected_ir,
|
||||
/*match_optimized_ir=*/true);
|
||||
|
||||
// Check that the kernel runs correctly.
|
||||
@ -297,12 +334,17 @@ TEST_F(GpuKernelTilingTest, TransposedInputWithoutUnsafeUseTiled) {
|
||||
auto hlo_module =
|
||||
ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
|
||||
.ValueOrDie();
|
||||
CompileAndVerifyIr(std::move(hlo_module),
|
||||
R"(
|
||||
auto expected_ir = is_built_with_rocm_ ? R"(
|
||||
; CHECK-LABEL: define amdgpu_kernel void @fusion
|
||||
; CHECK: call void @llvm.amdgcn.s.barrier()
|
||||
; CHECK: }
|
||||
)"
|
||||
: R"(
|
||||
; CHECK-LABEL: define void @fusion
|
||||
; CHECK: call void @llvm.nvvm.barrier0()
|
||||
; CHECK: }
|
||||
)",
|
||||
)";
|
||||
CompileAndVerifyIr(std::move(hlo_module), expected_ir,
|
||||
/*match_optimized_ir=*/true);
|
||||
// Check that the kernel runs correctly.
|
||||
EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0.0}));
|
||||
@ -329,14 +371,31 @@ TEST_F(GpuKernelTilingTest, ColumnReductionWithPowerOf2OutputElementsUnrolled) {
|
||||
auto hlo_module =
|
||||
ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
|
||||
.ValueOrDie();
|
||||
CompileAndVerifyIr(std::move(hlo_module),
|
||||
R"(
|
||||
auto expected_ir = is_built_with_rocm_ ? R"(
|
||||
; CHECK-LABEL: define amdgpu_kernel void @fusion
|
||||
;
|
||||
; CHECK-LABEL: atomic_op_loop_body{{.*}}:
|
||||
; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}}
|
||||
; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32
|
||||
; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]]
|
||||
;
|
||||
; CHECK-LABEL: atomic_op_loop_body{{.*}}:
|
||||
; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}}
|
||||
; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32
|
||||
; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]]
|
||||
;
|
||||
; CHECK-NOT: cmpxchg
|
||||
;
|
||||
; CHECK: }
|
||||
)"
|
||||
: R"(
|
||||
; CHECK-LABEL: define void @fusion
|
||||
; CHECK: atomicrmw fadd float
|
||||
; CHECK: atomicrmw fadd float
|
||||
; CHECK-NOT: atomicrmw fadd float
|
||||
; CHECK: }
|
||||
)",
|
||||
)";
|
||||
CompileAndVerifyIr(std::move(hlo_module), expected_ir,
|
||||
/*match_optimized_ir=*/true);
|
||||
// Check that the kernel runs correctly.
|
||||
EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5}));
|
||||
@ -376,13 +435,25 @@ TEST_F(GpuKernelTilingTest,
|
||||
auto hlo_module =
|
||||
ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
|
||||
.ValueOrDie();
|
||||
CompileAndVerifyIr(std::move(hlo_module),
|
||||
R"(
|
||||
auto expected_ir = is_built_with_rocm_ ? R"(
|
||||
; CHECK-LABEL: define amdgpu_kernel void @fusion
|
||||
;
|
||||
; CHECK-LABEL: atomic_op_loop_body{{.*}}:
|
||||
; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}}
|
||||
; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32
|
||||
; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]]
|
||||
;
|
||||
; CHECK-NOT: cmpxchg
|
||||
;
|
||||
; CHECK: }
|
||||
)"
|
||||
: R"(
|
||||
; CHECK-LABEL: define void @fusion
|
||||
; CHECK: atomicrmw fadd float
|
||||
; CHECK-NOT: atomicrmw fadd float
|
||||
; CHECK: }
|
||||
)",
|
||||
)";
|
||||
CompileAndVerifyIr(std::move(hlo_module), expected_ir,
|
||||
/*match_optimized_ir=*/true);
|
||||
// Check that the kernel runs correctly.
|
||||
EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5}));
|
||||
@ -424,8 +495,34 @@ TEST_F(GpuKernelTilingTest, ColumnReductionMOFUnrolled) {
|
||||
auto hlo_module =
|
||||
ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
|
||||
.ValueOrDie();
|
||||
CompileAndVerifyIr(std::move(hlo_module),
|
||||
R"(
|
||||
auto expected_ir = is_built_with_rocm_ ? R"(
|
||||
; CHECK-LABEL: define amdgpu_kernel void @fusion
|
||||
;
|
||||
; CHECK-LABEL: atomic_op_loop_body{{.*}}:
|
||||
; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}}
|
||||
; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32
|
||||
; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]]
|
||||
;
|
||||
; CHECK-LABEL: atomic_op_loop_body{{.*}}:
|
||||
; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}}
|
||||
; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32
|
||||
; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]]
|
||||
;
|
||||
; CHECK-LABEL: atomic_op_loop_body{{.*}}:
|
||||
; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}}
|
||||
; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32
|
||||
; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]]
|
||||
;
|
||||
; CHECK-LABEL: atomic_op_loop_body{{.*}}:
|
||||
; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}}
|
||||
; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32
|
||||
; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]]
|
||||
;
|
||||
; CHECK-NOT: cmpxchg
|
||||
;
|
||||
; CHECK: }
|
||||
)"
|
||||
: R"(
|
||||
; CHECK-LABEL: define void @fusion
|
||||
; CHECK: atomicrmw fadd float
|
||||
; CHECK: atomicrmw fadd float
|
||||
@ -433,7 +530,8 @@ TEST_F(GpuKernelTilingTest, ColumnReductionMOFUnrolled) {
|
||||
; CHECK: atomicrmw fadd float
|
||||
; CHECK-NOT: atomicrmw fadd float
|
||||
; CHECK: }
|
||||
)",
|
||||
)";
|
||||
CompileAndVerifyIr(std::move(hlo_module), expected_ir,
|
||||
/*match_optimized_ir=*/true);
|
||||
// Check that the kernel runs correctly.
|
||||
EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5}));
|
||||
@ -459,12 +557,20 @@ TEST_F(GpuKernelTilingTest, ColumnReductionWithLayoutChangeTiled) {
|
||||
auto hlo_module =
|
||||
ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
|
||||
.ValueOrDie();
|
||||
CompileAndVerifyIr(std::move(hlo_module),
|
||||
R"(
|
||||
auto expected_ir = is_built_with_rocm_ ? R"(
|
||||
; CHECK-LABEL: define amdgpu_kernel void @
|
||||
; CHECK-LABEL: atomic_op_loop_body{{.*}}:
|
||||
; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}}
|
||||
; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32
|
||||
; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]]
|
||||
; CHECK: }
|
||||
)"
|
||||
: R"(
|
||||
; CHECK-LABEL: define void @
|
||||
; CHECK: atomicrmw fadd float
|
||||
; CHECK: }
|
||||
)",
|
||||
)";
|
||||
CompileAndVerifyIr(std::move(hlo_module), expected_ir,
|
||||
/*match_optimized_ir=*/true);
|
||||
|
||||
// Check that the kernel runs correctly.
|
||||
@ -491,12 +597,17 @@ TEST_F(GpuKernelTilingTest, RowReductionWithLayoutChangeTiled) {
|
||||
auto hlo_module =
|
||||
ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
|
||||
.ValueOrDie();
|
||||
CompileAndVerifyIr(std::move(hlo_module),
|
||||
R"(
|
||||
auto expected_ir = is_built_with_rocm_ ? R"(
|
||||
; CHECK-LABEL: define amdgpu_kernel void @reduce
|
||||
; CHECK: call i32 @llvm.amdgcn.ds.bpermute
|
||||
; CHECK: }
|
||||
)"
|
||||
: R"(
|
||||
; CHECK-LABEL: define void @reduce
|
||||
; CHECK: call float @llvm.nvvm.shfl.sync.down.f32
|
||||
; CHECK: }
|
||||
)",
|
||||
)";
|
||||
CompileAndVerifyIr(std::move(hlo_module), expected_ir,
|
||||
/*match_optimized_ir=*/true);
|
||||
|
||||
// Check that the kernel runs correctly.
|
||||
@ -524,12 +635,20 @@ TEST_F(GpuKernelTilingTest,
|
||||
auto hlo_module =
|
||||
ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
|
||||
.ValueOrDie();
|
||||
CompileAndVerifyIr(std::move(hlo_module),
|
||||
R"(
|
||||
auto expected_ir = is_built_with_rocm_ ? R"(
|
||||
; CHECK-LABEL: define amdgpu_kernel void @reduce
|
||||
; CHECK-LABEL: atomic_op_loop_body{{.*}}:
|
||||
; CHECK: %[[fadd:.*]] = fadd float %{{.*}}, %{{.*}}
|
||||
; CHECK: %[[bitcast:.*]] = bitcast float %[[fadd]] to i32
|
||||
; CHECK: %{{.*}} = cmpxchg i32* %{{.*}}, i32 %{{.*}}, i32 %[[bitcast]]
|
||||
; CHECK: }
|
||||
)"
|
||||
: R"(
|
||||
; CHECK-LABEL: define void @reduce
|
||||
; CHECK: atomicrmw fadd float
|
||||
; CHECK: }
|
||||
)",
|
||||
)";
|
||||
CompileAndVerifyIr(std::move(hlo_module), expected_ir,
|
||||
/*match_optimized_ir=*/true);
|
||||
|
||||
// Check that the kernel runs correctly.
|
||||
@ -570,12 +689,17 @@ TEST_F(GpuKernelTilingTest, ColumnReductionSmallTileSizeX) {
|
||||
auto hlo_module =
|
||||
ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
|
||||
.ValueOrDie();
|
||||
CompileAndVerifyIr(std::move(hlo_module),
|
||||
R"(
|
||||
auto expected_ir = is_built_with_rocm_ ? R"(
|
||||
; CHECK-LABEL: define amdgpu_kernel void @fusion
|
||||
; CHECK-NOT: reduce.0.loop_header
|
||||
; CHECK: }
|
||||
)"
|
||||
: R"(
|
||||
; CHECK-LABEL: define void @fusion
|
||||
; CHECK-NOT: reduce.0.loop_header
|
||||
; CHECK: }
|
||||
)",
|
||||
)";
|
||||
CompileAndVerifyIr(std::move(hlo_module), expected_ir,
|
||||
/*match_optimized_ir=*/true);
|
||||
// Check that the kernel runs correctly.
|
||||
EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5}));
|
||||
@ -601,12 +725,17 @@ TEST_F(GpuKernelTilingTest, RowReductionWithSmallDimensionNotTiled) {
|
||||
auto hlo_module =
|
||||
ParseAndReturnVerifiedModule(kHloString, ConfigWithoutLayoutAssignment())
|
||||
.ValueOrDie();
|
||||
CompileAndVerifyIr(std::move(hlo_module),
|
||||
R"(
|
||||
auto expected_ir = is_built_with_rocm_ ? R"(
|
||||
; CHECK-LABEL: define amdgpu_kernel void @reduce
|
||||
; CHECK-NOT: call i32 @llvm.amdgcn.ds.bpermute
|
||||
; CHECK: }
|
||||
)"
|
||||
: R"(
|
||||
; CHECK-LABEL: define void @reduce
|
||||
; CHECK-NOT: call float @llvm.nvvm.shfl.sync.down.f32
|
||||
; CHECK: }
|
||||
)",
|
||||
)";
|
||||
CompileAndVerifyIr(std::move(hlo_module), expected_ir,
|
||||
/*match_optimized_ir=*/true);
|
||||
|
||||
// Check that the kernel runs correctly.
|
||||
|
@ -38,6 +38,11 @@ class GpuLdgTest : public GpuCodegenTest {};
|
||||
|
||||
// Parameters are never overwritten, so parameter reads should get ld.global.nc
|
||||
// reads.
|
||||
//
|
||||
// On the ROCM platform the "ptx" string is not populated for the compiled
|
||||
// executable, and hence the call to CompileAdnVerifyPtx does not do the
|
||||
// "VerifyPtx" part, it merely compiles the executable
|
||||
//
|
||||
TEST_F(GpuLdgTest, LdgForParamRead) {
|
||||
HloComputation::Builder builder(TestName());
|
||||
|
||||
@ -60,6 +65,11 @@ TEST_F(GpuLdgTest, LdgForParamRead) {
|
||||
// Check that reading a buffer produced by a non-parameter HLO also results in
|
||||
// ld.global.nc, if that buffer isn't modified within the instruction that reads
|
||||
// it.
|
||||
//
|
||||
// On the ROCM platform the "ptx" string is not populated for the compiled
|
||||
// executable, and hence the call to CompileAdnVerifyPtx does not do the
|
||||
// "VerifyPtx" part, it merely compiles the executable
|
||||
//
|
||||
TEST_F(GpuLdgTest, LdgForNonParamRead) {
|
||||
HloComputation::Builder builder(TestName());
|
||||
|
||||
@ -94,6 +104,11 @@ TEST_F(GpuLdgTest, LdgForNonParamRead) {
|
||||
// It seems like a fair bet that we won't start fusing sin into the output of
|
||||
// reduce in the foreseeable future. But if that turns out to be wrong, I give
|
||||
// you, future reader, permission to delete this test.
|
||||
//
|
||||
// On the ROCM platform the "ptx" string is not populated for the compiled
|
||||
// executable, and hence the call to CompileAdnVerifyPtx does not do the
|
||||
// "VerifyPtx" part, it merely compiles the executable
|
||||
//
|
||||
TEST_F(GpuLdgTest, NoLdgWhenSharingBuffer) {
|
||||
auto hlo_module = CreateNewVerifiedModule();
|
||||
HloComputation::Builder builder(TestName());
|
||||
|
Loading…
x
Reference in New Issue
Block a user