From 800a3b1fb40f0b2fc83e01ca0aa528786cc2dc42 Mon Sep 17 00:00:00 2001
From: Adrian Kuegel
Date: Thu, 17 Dec 2020 04:44:40 -0800
Subject: [PATCH] Enable denormal-fp-math-f32 mode for f32 kernels.

This is necessary to be compatible with TensorFlow, which uses the flag
-fcuda-flush-denormals-to-zero when compiling CUDA code with clang.

PiperOrigin-RevId: 347999405
Change-Id: If303ef6d922ae239fe6ba4e0581be9f12c2edeb2
---
 .../mlir/tools/kernel_gen/kernel_creator.cc    | 10 ++++++----
 .../mlir/tools/kernel_gen/kernel_creator.h     |  2 +-
 .../mlir/tools/kernel_gen/tf_to_kernel.cc      | 12 ++++++++----
 .../transforms/gpu_kernel_to_blob_pass.cc      | 18 ++++++++++++------
 .../mlir/tools/kernel_gen/transforms/passes.h  |  2 +-
 .../core/kernels/mlir_generated/build_defs.bzl |  3 +++
 6 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc
index 9985d752845..9e8b62dcffb 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc
@@ -245,7 +245,8 @@ Status AmendKernelLLVMIRWithStaticKnowledge(mlir::ModuleOp module) {
 Status GenerateDeviceCode(mlir::ModuleOp module,
                           llvm::StringRef gpu_binary_attr_name,
                           llvm::ArrayRef<std::string> architectures,
-                          bool generate_fatbin, bool print_ptx) {
+                          bool generate_fatbin, bool print_ptx,
+                          bool enable_ftz) {
   mlir::PassManager pm(module.getContext());
   applyTensorflowAndCLOptions(pm);
 
@@ -253,7 +254,8 @@ Status GenerateDeviceCode(mlir::ModuleOp module,
   // Remove debug information to ensure we do not create debug PTX.
   kernel_pm.addPass(mlir::createStripDebugInfoPass());
   kernel_pm.addPass(mlir::kernel_gen::transforms::CreateGpuKernelToBlobPass(
-      gpu_binary_attr_name, architectures, generate_fatbin, print_ptx));
+      gpu_binary_attr_name, architectures, generate_fatbin, print_ptx,
+      enable_ftz));
 
   return failed(pm.run(module))
              ? InternalError("Generating device code failed.")
@@ -281,7 +283,7 @@ StatusOr<mlir::OwningModuleRef> GenerateKernelForTfCode(
     llvm::ArrayRef<std::string> architectures,
     llvm::ArrayRef<uint32_t> tile_sizes,
     llvm::ArrayRef<uint32_t> unroll_factors, bool embed_memref_prints,
-    bool generate_fatbin, bool print_ptx) {
+    bool generate_fatbin, bool print_ptx, bool enable_ftz) {
   auto& registry = context.getDialectRegistry();
   mlir::RegisterAllTensorFlowDialects(registry);
   registry.insert();
@@ -302,7 +304,7 @@ StatusOr<mlir::OwningModuleRef> GenerateKernelForTfCode(
   TF_RETURN_IF_ERROR(AmendKernelLLVMIRWithStaticKnowledge(module.get()));
   TF_RETURN_IF_ERROR(GenerateDeviceCode(module.get(), kGpuBinaryAttrName,
                                         architectures, generate_fatbin,
-                                        print_ptx));
+                                        print_ptx, enable_ftz));
   TF_RETURN_IF_ERROR(LowerHostSideToFinalForm(module.get()));
   return module;
 }
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h
index ac8ce845713..8216656aa85 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h
@@ -40,7 +40,7 @@ xla::StatusOr<mlir::OwningModuleRef> GenerateKernelForTfCode(
     llvm::ArrayRef<uint32_t> tile_sizes = {16, 64},
     llvm::ArrayRef<uint32_t> unroll_factors = {},
     bool embed_memref_prints = false, bool generate_fatbin = true,
-    bool print_ptx = false);
+    bool print_ptx = false, bool enable_ftz = false);
 
 // Extracts gpu_binary from the converted module.
 xla::StatusOr<std::string> ExtractGpuBinary(mlir::ModuleOp module);
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_kernel.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_kernel.cc
index e0ad2349e89..823e143437a 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_kernel.cc
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_kernel.cc
@@ -106,7 +106,7 @@ xla::Status Run(llvm::StringRef input_file, llvm::StringRef output_file,
                 llvm::ArrayRef<std::string> architectures,
                 llvm::ArrayRef<uint32_t> tile_sizes,
                 llvm::ArrayRef<uint32_t> unroll_factors,
-                bool embed_memref_prints, bool print_ptx) {
+                bool embed_memref_prints, bool print_ptx, bool enable_ftz) {
   // Read TF code.
   std::string tf_code;
   TF_RETURN_IF_ERROR(
@@ -117,8 +117,7 @@ xla::Status Run(llvm::StringRef input_file, llvm::StringRef output_file,
       mlir::OwningModuleRef module,
       GenerateKernelForTfCode(context, tf_code, architectures, tile_sizes,
                               unroll_factors, embed_memref_prints,
-                              /*generate_fatbin=*/true,
-                              /*print_ptx=*/print_ptx));
+                              /*generate_fatbin=*/true, print_ptx, enable_ftz));
   // Get binary.
   TF_ASSIGN_OR_RETURN(std::string binary, EmitToBinary(*module));
 
@@ -147,6 +146,11 @@ int main(int argc, char** argv) {
       "print-ptx",
       llvm::cl::desc("Print generated PTX code per target architecture."),
       llvm::cl::init(false));
+  llvm::cl::opt<bool> enable_ftz(
+      "enable_ftz",
+      llvm::cl::desc(
+          "Enable the denormal flush to zero mode when generating code."),
+      llvm::cl::init(false));
   llvm::cl::list<std::string> architectures(
       "arch", llvm::cl::desc("target architectures (e.g. sm_70 or compute_75)"),
       llvm::cl::OneOrMore, llvm::cl::CommaSeparated);
@@ -166,7 +170,7 @@ int main(int argc, char** argv) {
 
   auto status = tensorflow::kernel_gen::Run(
       input_file, output_file, architectures, tile_sizes, unroll_factors,
-      embed_memref_prints, print_ptx);
+      embed_memref_prints, print_ptx, enable_ftz);
   if (!status.ok()) {
     LOG(ERROR) << status;
     return 1;
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc
index adeb14eef14..df6f55d9ef5 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc
@@ -53,13 +53,14 @@ class GpuKernelToBlobPass
  public:
   GpuKernelToBlobPass(mlir::StringRef blob_annotation,
                       llvm::ArrayRef<std::string> architectures,
-                      bool generate_fatbin, bool print_ptx) {
+                      bool generate_fatbin, bool print_ptx, bool enable_ftz) {
     if (!blob_annotation.empty()) {
       blob_annotation_ = blob_annotation.str();
     }
     architectures_ = architectures;
     generate_fatbin_ = generate_fatbin;
     print_ptx_ = print_ptx;
+    enable_ftz_ = enable_ftz;
   }
 
   void runOnOperation() override {
@@ -99,7 +100,9 @@ class GpuKernelToBlobPass
     llvmModule->setModuleIdentifier("acme");
 
     xla::HloModuleConfig config;
-    config.set_debug_options(xla::GetDebugOptionsFromFlags());
+    xla::DebugOptions options = xla::GetDebugOptionsFromFlags();
+    options.set_xla_gpu_ftz(enable_ftz_);
+    config.set_debug_options(options);
 
     using AmdGpuHsaco = std::vector;
     std::vector images;
@@ -148,7 +151,9 @@ class GpuKernelToBlobPass
     llvmModule->setDataLayout(xla::gpu::nvptx::kDataLayout);
 
     xla::HloModuleConfig config;
-    config.set_debug_options(xla::GetDebugOptionsFromFlags());
+    xla::DebugOptions options = xla::GetDebugOptionsFromFlags();
+    options.set_xla_gpu_ftz(enable_ftz_);
+    config.set_debug_options(options);
     auto enable_fusion = [](llvm::TargetMachine* target) {
       target->Options.AllowFPOpFusion =
           llvm::FPOpFusion::FPOpFusionMode::Fast;
@@ -241,15 +246,16 @@ class GpuKernelToBlobPass
     return InternalError(
         "Can't find libdevice directory ${CUDA_DIR}/nvvm/libdevice");
   }
+  bool enable_ftz_;
 };

}  // namespace

 std::unique_ptr<OperationPass<gpu::GPUModuleOp>> CreateGpuKernelToBlobPass(
     mlir::StringRef blob_annotation, ArrayRef<std::string> architectures,
-    bool generate_fatbin, bool print_ptx) {
-  return std::make_unique<GpuKernelToBlobPass>(blob_annotation, architectures,
-                                               generate_fatbin, print_ptx);
+    bool generate_fatbin, bool print_ptx, bool enable_ftz) {
+  return std::make_unique<GpuKernelToBlobPass>(
+      blob_annotation, architectures, generate_fatbin, print_ptx, enable_ftz);
 }

}  // namespace transforms
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h
index 98d831479f8..a29f4121e1c 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h
@@ -69,7 +69,7 @@ std::unique_ptr CreateParallelLoopsToSequential();
 std::unique_ptr<OperationPass<gpu::GPUModuleOp>> CreateGpuKernelToBlobPass(
     mlir::StringRef blob_annotation = {},
     ArrayRef<std::string> architectures = {}, bool generate_fatbin = true,
-    bool print_ptx = false);
+    bool print_ptx = false, bool enable_ftz = false);
 
 // Pass to propagate tensorflow runtime ABI knowledge across kernel boundaries.
 std::unique_ptr CreatePropagateTfAbiKnowledgeToKernels();
diff --git a/tensorflow/core/kernels/mlir_generated/build_defs.bzl b/tensorflow/core/kernels/mlir_generated/build_defs.bzl
index 0a232f5f41e..df7fcc371a7 100644
--- a/tensorflow/core/kernels/mlir_generated/build_defs.bzl
+++ b/tensorflow/core/kernels/mlir_generated/build_defs.bzl
@@ -111,6 +111,7 @@ def _gen_kernel_fatbin_impl(ctx):
             "--arch=%s" % arch_flag,
             "--input=%s" % ctx.file.mlir_op.path,
             "--output=%s" % gpu_bin.path,
+            "--enable_ftz=%s" % (ctx.attr.data_type == "f32"),
         ],
         mnemonic = "compile",
     )
@@ -131,6 +132,7 @@ def _gen_kernel_fatbin_impl(ctx):
 _gen_kernel_fatbin_rule = rule(
     attrs = {
         "mlir_op": attr.label(mandatory = True, allow_single_file = True),
+        "data_type": attr.string(mandatory = True),
         "tile_size": attr.string(mandatory = True),
         "unroll_factors": attr.string(),
         "gpu_archs": attr.string_list(mandatory = True),
@@ -174,6 +176,7 @@ def gen_kernel_library(name, types, tile_size, tags = [], unroll_factors = None,
         _gen_kernel_fatbin_rule(
             name = "{name}_{type}_kernel_generator".format(name = name, type = type),
             mlir_op = "{name}_{type}.mlir".format(name = name, type = type),
+            data_type = type,
             gpu_archs = rocm_gpu_architectures() if rocm_is_configured() else cuda_gpu_architectures(),
             tile_size = tile_size,
             unroll_factors = unroll_factors,
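
A note on what the new enable_ftz knob means for readers who have not run into
flush-to-zero (FTZ) before: with FTZ enabled, f32 operations that would produce
a subnormal (denormal) result return 0 instead, matching what TensorFlow's
-fcuda-flush-denormals-to-zero gives hand-written CUDA kernels. The small
host-side C++ sketch below illustrates the numeric effect only; it is not part
of the patch and assumes an x86 host, where the SSE control register provides
the switch that xla_gpu_ftz / denormal-fp-math-f32 provide for the generated
GPU code.

  // Illustration of flush-to-zero semantics for f32 (host-side, x86/SSE only).
  // Not part of the patch; the generated GPU kernels get the analogous
  // behavior through options.set_xla_gpu_ftz(true) in GpuKernelToBlobPass.
  #include <xmmintrin.h>  // _MM_SET_FLUSH_ZERO_MODE

  #include <cstdio>
  #include <limits>

  int main() {
    // volatile keeps the compiler from folding the divisions at compile time.
    volatile float tiny = std::numeric_limits<float>::min();  // smallest normal f32

    // Default IEEE behavior: gradual underflow yields a nonzero subnormal.
    float subnormal = tiny / 2.0f;
    std::printf("without FTZ: %g\n", subnormal);  // ~5.9e-39

    // With flush-to-zero enabled, the same operation underflows to 0.
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
    float flushed = tiny / 2.0f;
    std::printf("with FTZ:    %g\n", flushed);  // 0
    return 0;
  }

The build_defs.bzl change sets --enable_ftz to True only when the kernel's
data_type is f32, in line with the commit title: the mode being enabled is
denormal-fp-math-f32, so f64 kernels keep their default denormal handling.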