From 800a3b1fb40f0b2fc83e01ca0aa528786cc2dc42 Mon Sep 17 00:00:00 2001
From: Adrian Kuegel
Date: Thu, 17 Dec 2020 04:44:40 -0800
Subject: [PATCH] Enable denormal-fp-math-f32 mode for f32 kernels.

This is necessary to be compatible with TensorFlow, which uses the flag
-fcuda-flush-denormals-to-zero when compiling CUDA code with clang.

PiperOrigin-RevId: 347999405
Change-Id: If303ef6d922ae239fe6ba4e0581be9f12c2edeb2
---
 .../mlir/tools/kernel_gen/kernel_creator.cc    | 10 ++++++----
 .../mlir/tools/kernel_gen/kernel_creator.h     |  2 +-
 .../mlir/tools/kernel_gen/tf_to_kernel.cc      | 12 ++++++++----
 .../transforms/gpu_kernel_to_blob_pass.cc      | 18 ++++++++++++------
 .../mlir/tools/kernel_gen/transforms/passes.h  |  2 +-
 .../core/kernels/mlir_generated/build_defs.bzl |  3 +++
 6 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc
index 9985d752845..9e8b62dcffb 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc
@@ -245,7 +245,8 @@ Status AmendKernelLLVMIRWithStaticKnowledge(mlir::ModuleOp module) {
 Status GenerateDeviceCode(mlir::ModuleOp module,
                           llvm::StringRef gpu_binary_attr_name,
                           llvm::ArrayRef<std::string> architectures,
-                          bool generate_fatbin, bool print_ptx) {
+                          bool generate_fatbin, bool print_ptx,
+                          bool enable_ftz) {
   mlir::PassManager pm(module.getContext());
   applyTensorflowAndCLOptions(pm);
 
@@ -253,7 +254,8 @@ Status GenerateDeviceCode(mlir::ModuleOp module,
   // Remove debug information to ensure we do not create debug PTX.
   kernel_pm.addPass(mlir::createStripDebugInfoPass());
   kernel_pm.addPass(mlir::kernel_gen::transforms::CreateGpuKernelToBlobPass(
-      gpu_binary_attr_name, architectures, generate_fatbin, print_ptx));
+      gpu_binary_attr_name, architectures, generate_fatbin, print_ptx,
+      enable_ftz));
 
   return failed(pm.run(module))
              ? InternalError("Generating device code failed.")
@@ -281,7 +283,7 @@ StatusOr<mlir::OwningModuleRef> GenerateKernelForTfCode(
     llvm::ArrayRef<std::string> architectures,
     llvm::ArrayRef<uint32_t> tile_sizes,
     llvm::ArrayRef<uint32_t> unroll_factors, bool embed_memref_prints,
-    bool generate_fatbin, bool print_ptx) {
+    bool generate_fatbin, bool print_ptx, bool enable_ftz) {
   auto& registry = context.getDialectRegistry();
   mlir::RegisterAllTensorFlowDialects(registry);
   registry.insert();
@@ -302,7 +304,7 @@ StatusOr<mlir::OwningModuleRef> GenerateKernelForTfCode(
   TF_RETURN_IF_ERROR(AmendKernelLLVMIRWithStaticKnowledge(module.get()));
   TF_RETURN_IF_ERROR(GenerateDeviceCode(module.get(), kGpuBinaryAttrName,
                                         architectures, generate_fatbin,
-                                        print_ptx));
+                                        print_ptx, enable_ftz));
   TF_RETURN_IF_ERROR(LowerHostSideToFinalForm(module.get()));
   return module;
 }
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h
index ac8ce845713..8216656aa85 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h
@@ -40,7 +40,7 @@ xla::StatusOr<mlir::OwningModuleRef> GenerateKernelForTfCode(
     llvm::ArrayRef<uint32_t> tile_sizes = {16, 64},
     llvm::ArrayRef<uint32_t> unroll_factors = {},
     bool embed_memref_prints = false, bool generate_fatbin = true,
-    bool print_ptx = false);
+    bool print_ptx = false, bool enable_ftz = false);
 
 // Extracts gpu_binary from the converted module.
 xla::StatusOr<std::string> ExtractGpuBinary(mlir::ModuleOp module);
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_kernel.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_kernel.cc
index e0ad2349e89..823e143437a 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_kernel.cc
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_kernel.cc
@@ -106,7 +106,7 @@ xla::Status Run(llvm::StringRef input_file, llvm::StringRef output_file,
                 llvm::ArrayRef<std::string> architectures,
                 llvm::ArrayRef<uint32_t> tile_sizes,
                 llvm::ArrayRef<uint32_t> unroll_factors,
-                bool embed_memref_prints, bool print_ptx) {
+                bool embed_memref_prints, bool print_ptx, bool enable_ftz) {
   // Read TF code.
   std::string tf_code;
   TF_RETURN_IF_ERROR(
@@ -117,8 +117,7 @@ xla::Status Run(llvm::StringRef input_file, llvm::StringRef output_file,
       mlir::OwningModuleRef module,
       GenerateKernelForTfCode(context, tf_code, architectures, tile_sizes,
                               unroll_factors, embed_memref_prints,
-                              /*generate_fatbin=*/true,
-                              /*print_ptx=*/print_ptx));
+                              /*generate_fatbin=*/true, print_ptx, enable_ftz));
   // Get binary.
   TF_ASSIGN_OR_RETURN(std::string binary, EmitToBinary(*module));
 
@@ -147,6 +146,11 @@ int main(int argc, char** argv) {
       "print-ptx",
       llvm::cl::desc("Print generated PTX code per target architecture."),
       llvm::cl::init(false));
+  llvm::cl::opt<bool> enable_ftz(
+      "enable_ftz",
+      llvm::cl::desc(
+          "Enable the denormal flush to zero mode when generating code."),
+      llvm::cl::init(false));
   llvm::cl::list<std::string> architectures(
       "arch", llvm::cl::desc("target architectures (e.g. sm_70 or compute_75)"),
       llvm::cl::OneOrMore, llvm::cl::CommaSeparated);
@@ -166,7 +170,7 @@ int main(int argc, char** argv) {
 
   auto status = tensorflow::kernel_gen::Run(
       input_file, output_file, architectures, tile_sizes, unroll_factors,
-      embed_memref_prints, print_ptx);
+      embed_memref_prints, print_ptx, enable_ftz);
   if (!status.ok()) {
     LOG(ERROR) << status;
     return 1;
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc
index adeb14eef14..df6f55d9ef5 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc
@@ -53,13 +53,14 @@ class GpuKernelToBlobPass
  public:
   GpuKernelToBlobPass(mlir::StringRef blob_annotation,
                       llvm::ArrayRef<std::string> architectures,
-                      bool generate_fatbin, bool print_ptx) {
+                      bool generate_fatbin, bool print_ptx, bool enable_ftz) {
     if (!blob_annotation.empty()) {
       blob_annotation_ = blob_annotation.str();
     }
     architectures_ = architectures;
     generate_fatbin_ = generate_fatbin;
     print_ptx_ = print_ptx;
+    enable_ftz_ = enable_ftz;
   }
 
   void runOnOperation() override {
@@ -99,7 +100,9 @@ class GpuKernelToBlobPass
     llvmModule->setModuleIdentifier("acme");
 
     xla::HloModuleConfig config;
-    config.set_debug_options(xla::GetDebugOptionsFromFlags());
+    xla::DebugOptions options = xla::GetDebugOptionsFromFlags();
+    options.set_xla_gpu_ftz(enable_ftz_);
+    config.set_debug_options(options);
 
     using AmdGpuHsaco = std::vector;
     std::vector images;
@@ -148,7 +151,9 @@ class GpuKernelToBlobPass
     llvmModule->setDataLayout(xla::gpu::nvptx::kDataLayout);
 
     xla::HloModuleConfig config;
-    config.set_debug_options(xla::GetDebugOptionsFromFlags());
+    xla::DebugOptions options = xla::GetDebugOptionsFromFlags();
+    options.set_xla_gpu_ftz(enable_ftz_);
+    config.set_debug_options(options);
     auto enable_fusion = [](llvm::TargetMachine* target) {
       target->Options.AllowFPOpFusion =
           llvm::FPOpFusion::FPOpFusionMode::Fast;
@@ -241,15 +246,16 @@ class GpuKernelToBlobPass
     return InternalError(
         "Can't find libdevice directory ${CUDA_DIR}/nvvm/libdevice");
   }
+  bool enable_ftz_;
 };

}  // namespace

 std::unique_ptr<OperationPass<gpu::GPUModuleOp>> CreateGpuKernelToBlobPass(
     mlir::StringRef blob_annotation, ArrayRef<std::string> architectures,
-    bool generate_fatbin, bool print_ptx) {
-  return std::make_unique<GpuKernelToBlobPass>(blob_annotation, architectures,
-                                               generate_fatbin, print_ptx);
+    bool generate_fatbin, bool print_ptx, bool enable_ftz) {
+  return std::make_unique<GpuKernelToBlobPass>(
+      blob_annotation, architectures, generate_fatbin, print_ptx, enable_ftz);
 }

}  // namespace transforms
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h
index 98d831479f8..a29f4121e1c 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h
@@ -69,7 +69,7 @@ std::unique_ptr CreateParallelLoopsToSequential();
 std::unique_ptr<OperationPass<gpu::GPUModuleOp>> CreateGpuKernelToBlobPass(
     mlir::StringRef blob_annotation = {},
     ArrayRef<std::string> architectures = {}, bool generate_fatbin = true,
-    bool print_ptx = false);
+    bool print_ptx = false, bool enable_ftz = false);
 
 // Pass to propagate tensorflow runtime ABI knowledge across kernel boundaries.
 std::unique_ptr CreatePropagateTfAbiKnowledgeToKernels();
diff --git a/tensorflow/core/kernels/mlir_generated/build_defs.bzl b/tensorflow/core/kernels/mlir_generated/build_defs.bzl
index 0a232f5f41e..df7fcc371a7 100644
--- a/tensorflow/core/kernels/mlir_generated/build_defs.bzl
+++ b/tensorflow/core/kernels/mlir_generated/build_defs.bzl
@@ -111,6 +111,7 @@ def _gen_kernel_fatbin_impl(ctx):
             "--arch=%s" % arch_flag,
             "--input=%s" % ctx.file.mlir_op.path,
             "--output=%s" % gpu_bin.path,
+            "--enable_ftz=%s" % (ctx.attr.data_type == "f32"),
         ],
         mnemonic = "compile",
     )
@@ -131,6 +132,7 @@ def _gen_kernel_fatbin_impl(ctx):
 _gen_kernel_fatbin_rule = rule(
     attrs = {
         "mlir_op": attr.label(mandatory = True, allow_single_file = True),
+        "data_type": attr.string(mandatory = True),
         "tile_size": attr.string(mandatory = True),
         "unroll_factors": attr.string(),
         "gpu_archs": attr.string_list(mandatory = True),
@@ -174,6 +176,7 @@ def gen_kernel_library(name, types, tile_size, tags = [], unroll_factors = None,
         _gen_kernel_fatbin_rule(
             name = "{name}_{type}_kernel_generator".format(name = name, type = type),
             mlir_op = "{name}_{type}.mlir".format(name = name, type = type),
+            data_type = type,
             gpu_archs = rocm_gpu_architectures() if rocm_is_configured() else cuda_gpu_architectures(),
             tile_size = tile_size,
             unroll_factors = unroll_factors,
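
A note on what the new enable_ftz knob means for readers who have not run into
flush-to-zero (FTZ) before: with FTZ enabled, f32 operations that would produce
a subnormal (denormal) result return 0 instead, matching what TensorFlow's
-fcuda-flush-denormals-to-zero gives hand-written CUDA kernels. The small
host-side C++ sketch below illustrates the numeric effect only; it is not part
of the patch and assumes an x86 host, where the SSE control register provides
the switch that xla_gpu_ftz / denormal-fp-math-f32 provide for the generated
GPU code.

  // Illustration of flush-to-zero semantics for f32 (host-side, x86/SSE only).
  // Not part of the patch; the generated GPU kernels get the analogous
  // behavior through options.set_xla_gpu_ftz(true) in GpuKernelToBlobPass.
  #include <xmmintrin.h>  // _MM_SET_FLUSH_ZERO_MODE

  #include <cstdio>
  #include <limits>

  int main() {
    // volatile keeps the compiler from folding the divisions at compile time.
    volatile float tiny = std::numeric_limits<float>::min();  // smallest normal f32

    // Default IEEE behavior: gradual underflow yields a nonzero subnormal.
    float subnormal = tiny / 2.0f;
    std::printf("without FTZ: %g\n", subnormal);  // ~5.9e-39

    // With flush-to-zero enabled, the same operation underflows to 0.
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
    float flushed = tiny / 2.0f;
    std::printf("with FTZ:    %g\n", flushed);  // 0
    return 0;
  }

The build_defs.bzl change sets --enable_ftz to True only when the kernel's
data_type is f32, in line with the commit title: the mode being enabled is
denormal-fp-math-f32, so f64 kernels keep their default denormal handling.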