Enable denormal-fp-math-f32 mode for f32 kernels.

This is necessary to be compatible with TensorFlow, which uses the flag
-fcuda-flush-denormals-to-zero when compiling CUDA code with clang.
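
For context, "flush to zero" (FTZ) means that f32 denormal (subnormal) inputs
and results are replaced by 0.0 instead of being handled per IEEE 754. The
snippet below is a small host-side sketch, not part of this change, that only
illustrates that semantic difference; the program itself runs with the host's
default IEEE denormal handling and merely prints what an FTZ-enabled f32
kernel would produce.

// Illustration only: the observable difference between IEEE and FTZ handling
// of an f32 denormal operand.
#include <cstdio>
#include <limits>

int main() {
  // Smallest positive f32 denormal, roughly 1.4e-45.
  const float denorm = std::numeric_limits<float>::denorm_min();
  // Default IEEE behavior (as on a typical host): the denormal is preserved.
  std::printf("ieee: denorm * 1.0f = %g\n", denorm * 1.0f);
  // Under FTZ, as requested for the generated f32 kernels, the denormal
  // operand is flushed to zero, so the same expression evaluates to 0.
  std::printf("ftz:  denorm * 1.0f = %g\n", 0.0f);
  return 0;
}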

PiperOrigin-RevId: 347999405
Change-Id: If303ef6d922ae239fe6ba4e0581be9f12c2edeb2
Adrian Kuegel authored on 2020-12-17 04:44:40 -08:00; committed by TensorFlower Gardener
parent 84de9ebadc
commit 800a3b1fb4
6 changed files with 31 additions and 16 deletions


@@ -245,7 +245,8 @@ Status AmendKernelLLVMIRWithStaticKnowledge(mlir::ModuleOp module) {
 Status GenerateDeviceCode(mlir::ModuleOp module,
                           llvm::StringRef gpu_binary_attr_name,
                           llvm::ArrayRef<std::string> architectures,
-                          bool generate_fatbin, bool print_ptx) {
+                          bool generate_fatbin, bool print_ptx,
+                          bool enable_ftz) {
   mlir::PassManager pm(module.getContext());
   applyTensorflowAndCLOptions(pm);
 
@@ -253,7 +254,8 @@ Status GenerateDeviceCode(mlir::ModuleOp module,
   // Remove debug information to ensure we do not create debug PTX.
   kernel_pm.addPass(mlir::createStripDebugInfoPass());
   kernel_pm.addPass(mlir::kernel_gen::transforms::CreateGpuKernelToBlobPass(
-      gpu_binary_attr_name, architectures, generate_fatbin, print_ptx));
+      gpu_binary_attr_name, architectures, generate_fatbin, print_ptx,
+      enable_ftz));
 
   return failed(pm.run(module))
              ? InternalError("Generating device code failed.")
@@ -281,7 +283,7 @@ StatusOr<mlir::OwningModuleRef> GenerateKernelForTfCode(
     llvm::ArrayRef<std::string> architectures,
     llvm::ArrayRef<uint32_t> tile_sizes,
     llvm::ArrayRef<uint32_t> unroll_factors, bool embed_memref_prints,
-    bool generate_fatbin, bool print_ptx) {
+    bool generate_fatbin, bool print_ptx, bool enable_ftz) {
   auto& registry = context.getDialectRegistry();
   mlir::RegisterAllTensorFlowDialects(registry);
   registry.insert<mlir::chlo::HloClientDialect, mlir::mhlo::MhloDialect>();
@@ -302,7 +304,7 @@ StatusOr<mlir::OwningModuleRef> GenerateKernelForTfCode(
   TF_RETURN_IF_ERROR(AmendKernelLLVMIRWithStaticKnowledge(module.get()));
   TF_RETURN_IF_ERROR(GenerateDeviceCode(module.get(), kGpuBinaryAttrName,
                                         architectures, generate_fatbin,
-                                        print_ptx));
+                                        print_ptx, enable_ftz));
   TF_RETURN_IF_ERROR(LowerHostSideToFinalForm(module.get()));
   return module;
 }


@@ -40,7 +40,7 @@ xla::StatusOr<mlir::OwningModuleRef> GenerateKernelForTfCode(
     llvm::ArrayRef<uint32_t> tile_sizes = {16, 64},
     llvm::ArrayRef<uint32_t> unroll_factors = {},
     bool embed_memref_prints = false, bool generate_fatbin = true,
-    bool print_ptx = false);
+    bool print_ptx = false, bool enable_ftz = false);
 
 // Extracts gpu_binary from the converted module.
 xla::StatusOr<std::string> ExtractGpuBinary(mlir::ModuleOp module);


@@ -106,7 +106,7 @@ xla::Status Run(llvm::StringRef input_file, llvm::StringRef output_file,
                 llvm::ArrayRef<std::string> architectures,
                 llvm::ArrayRef<uint32_t> tile_sizes,
                 llvm::ArrayRef<uint32_t> unroll_factors,
-                bool embed_memref_prints, bool print_ptx) {
+                bool embed_memref_prints, bool print_ptx, bool enable_ftz) {
   // Read TF code.
   std::string tf_code;
   TF_RETURN_IF_ERROR(
@@ -117,8 +117,7 @@ xla::Status Run(llvm::StringRef input_file, llvm::StringRef output_file,
       mlir::OwningModuleRef module,
       GenerateKernelForTfCode(context, tf_code, architectures, tile_sizes,
                               unroll_factors, embed_memref_prints,
-                              /*generate_fatbin=*/true,
-                              /*print_ptx=*/print_ptx));
+                              /*generate_fatbin=*/true, print_ptx, enable_ftz));
 
   // Get binary.
   TF_ASSIGN_OR_RETURN(std::string binary, EmitToBinary(*module));
@@ -147,6 +146,11 @@ int main(int argc, char** argv) {
       "print-ptx",
       llvm::cl::desc("Print generated PTX code per target architecture."),
       llvm::cl::init(false));
+  llvm::cl::opt<bool> enable_ftz(
+      "enable_ftz",
+      llvm::cl::desc(
+          "Enable the denormal flush to zero mode when generating code."),
+      llvm::cl::init(false));
   llvm::cl::list<std::string> architectures(
       "arch", llvm::cl::desc("target architectures (e.g. sm_70 or compute_75)"),
       llvm::cl::OneOrMore, llvm::cl::CommaSeparated);
@@ -166,7 +170,7 @@
 
   auto status = tensorflow::kernel_gen::Run(
       input_file, output_file, architectures, tile_sizes, unroll_factors,
-      embed_memref_prints, print_ptx);
+      embed_memref_prints, print_ptx, enable_ftz);
   if (!status.ok()) {
     LOG(ERROR) << status;
     return 1;


@@ -53,13 +53,14 @@ class GpuKernelToBlobPass
  public:
   GpuKernelToBlobPass(mlir::StringRef blob_annotation,
                       llvm::ArrayRef<std::string> architectures,
-                      bool generate_fatbin, bool print_ptx) {
+                      bool generate_fatbin, bool print_ptx, bool enable_ftz) {
     if (!blob_annotation.empty()) {
       blob_annotation_ = blob_annotation.str();
     }
     architectures_ = architectures;
     generate_fatbin_ = generate_fatbin;
     print_ptx_ = print_ptx;
+    enable_ftz_ = enable_ftz;
   }
 
   void runOnOperation() override {
@@ -99,7 +100,9 @@ class GpuKernelToBlobPass
     llvmModule->setModuleIdentifier("acme");
 
     xla::HloModuleConfig config;
-    config.set_debug_options(xla::GetDebugOptionsFromFlags());
+    xla::DebugOptions options = xla::GetDebugOptionsFromFlags();
+    options.set_xla_gpu_ftz(enable_ftz_);
+    config.set_debug_options(options);
 
     using AmdGpuHsaco = std::vector<tensorflow::uint8>;
     std::vector<tensorflow::se::HsacoImage> images;
@@ -148,7 +151,9 @@ class GpuKernelToBlobPass
     llvmModule->setDataLayout(xla::gpu::nvptx::kDataLayout);
 
     xla::HloModuleConfig config;
-    config.set_debug_options(xla::GetDebugOptionsFromFlags());
+    xla::DebugOptions options = xla::GetDebugOptionsFromFlags();
+    options.set_xla_gpu_ftz(enable_ftz_);
+    config.set_debug_options(options);
 
     auto enable_fusion = [](llvm::TargetMachine* target) {
       target->Options.AllowFPOpFusion = llvm::FPOpFusion::FPOpFusionMode::Fast;
@@ -241,15 +246,16 @@ class GpuKernelToBlobPass
     return InternalError(
         "Can't find libdevice directory ${CUDA_DIR}/nvvm/libdevice");
   }
+  bool enable_ftz_;
 };
 
 }  // namespace
 
 std::unique_ptr<OperationPass<gpu::GPUModuleOp>> CreateGpuKernelToBlobPass(
     mlir::StringRef blob_annotation, ArrayRef<std::string> architectures,
-    bool generate_fatbin, bool print_ptx) {
-  return std::make_unique<GpuKernelToBlobPass>(blob_annotation, architectures,
-                                               generate_fatbin, print_ptx);
+    bool generate_fatbin, bool print_ptx, bool enable_ftz) {
+  return std::make_unique<GpuKernelToBlobPass>(
+      blob_annotation, architectures, generate_fatbin, print_ptx, enable_ftz);
 }
 
 }  // namespace transforms
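
The pass above threads enable_ftz into xla::DebugOptions via set_xla_gpu_ftz
before the kernel module is lowered to a device binary. As a rough sketch of
the mechanism such a flag typically relies on (an assumption about the LLVM
backend, not code from this commit): clang's -fcuda-flush-denormals-to-zero and
the NVPTX backend communicate the f32 denormal mode through the LLVM
"denormal-fp-math-f32" function attribute, which a hypothetical helper could
set like this:

// Hypothetical helper, for illustration only: tag every defined function in an
// llvm::Module with the f32 denormal mode; "preserve-sign" is what backends
// read as "flush f32 denormals to zero".
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"

void SetF32DenormalMode(llvm::Module &module, bool enable_ftz) {
  const char *mode = enable_ftz ? "preserve-sign,preserve-sign" : "ieee,ieee";
  for (llvm::Function &fn : module) {
    if (!fn.isDeclaration()) fn.addFnAttr("denormal-fp-math-f32", mode);
  }
}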


@@ -69,7 +69,7 @@ std::unique_ptr<FunctionPass> CreateParallelLoopsToSequential();
 std::unique_ptr<OperationPass<gpu::GPUModuleOp>> CreateGpuKernelToBlobPass(
     mlir::StringRef blob_annotation = {},
     ArrayRef<std::string> architectures = {}, bool generate_fatbin = true,
-    bool print_ptx = false);
+    bool print_ptx = false, bool enable_ftz = false);
 
 // Pass to propagate tensorflow runtime ABI knowledge across kernel boundaries.
 std::unique_ptr<FunctionPass> CreatePropagateTfAbiKnowledgeToKernels();


@@ -111,6 +111,7 @@ def _gen_kernel_fatbin_impl(ctx):
             "--arch=%s" % arch_flag,
             "--input=%s" % ctx.file.mlir_op.path,
             "--output=%s" % gpu_bin.path,
+            "--enable_ftz=%s" % (ctx.attr.data_type == "f32"),
         ],
         mnemonic = "compile",
     )
@@ -131,6 +132,7 @@ def _gen_kernel_fatbin_impl(ctx):
 _gen_kernel_fatbin_rule = rule(
     attrs = {
         "mlir_op": attr.label(mandatory = True, allow_single_file = True),
+        "data_type": attr.string(mandatory = True),
         "tile_size": attr.string(mandatory = True),
         "unroll_factors": attr.string(),
         "gpu_archs": attr.string_list(mandatory = True),
@@ -174,6 +176,7 @@ def gen_kernel_library(name, types, tile_size, tags = [], unroll_factors = None,
         _gen_kernel_fatbin_rule(
             name = "{name}_{type}_kernel_generator".format(name = name, type = type),
             mlir_op = "{name}_{type}.mlir".format(name = name, type = type),
+            data_type = type,
             gpu_archs = rocm_gpu_architectures() if rocm_is_configured() else cuda_gpu_architectures(),
             tile_size = tile_size,
             unroll_factors = unroll_factors,