Enable denormal-fp-math-f32 mode for f32 kernels.

This is necessary to be compatible with TensorFlow, which uses the flag
-fcuda-flush-denormals-to-zero when compiling CUDA code with clang.
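
For context, "flush to zero" (FTZ) means that f32 denormal (subnormal) inputs
and results are replaced by 0.0 instead of being handled per IEEE 754. The
snippet below is a small host-side sketch, not part of this change, that only
illustrates that semantic difference; the program itself runs with the host's
default IEEE denormal handling and merely prints what an FTZ-enabled f32
kernel would produce.

// Illustration only: the observable difference between IEEE and FTZ handling
// of an f32 denormal operand.
#include <cstdio>
#include <limits>

int main() {
  // Smallest positive f32 denormal, roughly 1.4e-45.
  const float denorm = std::numeric_limits<float>::denorm_min();
  // Default IEEE behavior (as on a typical host): the denormal is preserved.
  std::printf("ieee: denorm * 1.0f = %g\n", denorm * 1.0f);
  // Under FTZ, as requested for the generated f32 kernels, the denormal
  // operand is flushed to zero, so the same expression evaluates to 0.
  std::printf("ftz:  denorm * 1.0f = %g\n", 0.0f);
  return 0;
}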

PiperOrigin-RevId: 347999405
Change-Id: If303ef6d922ae239fe6ba4e0581be9f12c2edeb2
Adrian Kuegel authored on 2020-12-17 04:44:40 -08:00; committed by TensorFlower Gardener
parent 84de9ebadc
commit 800a3b1fb4
6 changed files with 31 additions and 16 deletions


@@ -245,7 +245,8 @@ Status AmendKernelLLVMIRWithStaticKnowledge(mlir::ModuleOp module) {
 Status GenerateDeviceCode(mlir::ModuleOp module,
                           llvm::StringRef gpu_binary_attr_name,
                           llvm::ArrayRef<std::string> architectures,
-                          bool generate_fatbin, bool print_ptx) {
+                          bool generate_fatbin, bool print_ptx,
+                          bool enable_ftz) {
   mlir::PassManager pm(module.getContext());
   applyTensorflowAndCLOptions(pm);
 
@@ -253,7 +254,8 @@ Status GenerateDeviceCode(mlir::ModuleOp module,
   // Remove debug information to ensure we do not create debug PTX.
   kernel_pm.addPass(mlir::createStripDebugInfoPass());
   kernel_pm.addPass(mlir::kernel_gen::transforms::CreateGpuKernelToBlobPass(
-      gpu_binary_attr_name, architectures, generate_fatbin, print_ptx));
+      gpu_binary_attr_name, architectures, generate_fatbin, print_ptx,
+      enable_ftz));
 
   return failed(pm.run(module))
              ? InternalError("Generating device code failed.")
@@ -281,7 +283,7 @@ StatusOr<mlir::OwningModuleRef> GenerateKernelForTfCode(
     llvm::ArrayRef<std::string> architectures,
     llvm::ArrayRef<uint32_t> tile_sizes,
     llvm::ArrayRef<uint32_t> unroll_factors, bool embed_memref_prints,
-    bool generate_fatbin, bool print_ptx) {
+    bool generate_fatbin, bool print_ptx, bool enable_ftz) {
   auto& registry = context.getDialectRegistry();
   mlir::RegisterAllTensorFlowDialects(registry);
   registry.insert<mlir::chlo::HloClientDialect, mlir::mhlo::MhloDialect>();
@@ -302,7 +304,7 @@ StatusOr<mlir::OwningModuleRef> GenerateKernelForTfCode(
   TF_RETURN_IF_ERROR(AmendKernelLLVMIRWithStaticKnowledge(module.get()));
   TF_RETURN_IF_ERROR(GenerateDeviceCode(module.get(), kGpuBinaryAttrName,
                                         architectures, generate_fatbin,
-                                        print_ptx));
+                                        print_ptx, enable_ftz));
   TF_RETURN_IF_ERROR(LowerHostSideToFinalForm(module.get()));
   return module;
 }


@@ -40,7 +40,7 @@ xla::StatusOr<mlir::OwningModuleRef> GenerateKernelForTfCode(
     llvm::ArrayRef<uint32_t> tile_sizes = {16, 64},
     llvm::ArrayRef<uint32_t> unroll_factors = {},
     bool embed_memref_prints = false, bool generate_fatbin = true,
-    bool print_ptx = false);
+    bool print_ptx = false, bool enable_ftz = false);
 
 // Extracts gpu_binary from the converted module.
 xla::StatusOr<std::string> ExtractGpuBinary(mlir::ModuleOp module);


@@ -106,7 +106,7 @@ xla::Status Run(llvm::StringRef input_file, llvm::StringRef output_file,
                 llvm::ArrayRef<std::string> architectures,
                 llvm::ArrayRef<uint32_t> tile_sizes,
                 llvm::ArrayRef<uint32_t> unroll_factors,
-                bool embed_memref_prints, bool print_ptx) {
+                bool embed_memref_prints, bool print_ptx, bool enable_ftz) {
   // Read TF code.
   std::string tf_code;
   TF_RETURN_IF_ERROR(
@@ -117,8 +117,7 @@ xla::Status Run(llvm::StringRef input_file, llvm::StringRef output_file,
       mlir::OwningModuleRef module,
       GenerateKernelForTfCode(context, tf_code, architectures, tile_sizes,
                               unroll_factors, embed_memref_prints,
-                              /*generate_fatbin=*/true,
-                              /*print_ptx=*/print_ptx));
+                              /*generate_fatbin=*/true, print_ptx, enable_ftz));
 
   // Get binary.
   TF_ASSIGN_OR_RETURN(std::string binary, EmitToBinary(*module));
@@ -147,6 +146,11 @@ int main(int argc, char** argv) {
       "print-ptx",
       llvm::cl::desc("Print generated PTX code per target architecture."),
       llvm::cl::init(false));
+  llvm::cl::opt<bool> enable_ftz(
+      "enable_ftz",
+      llvm::cl::desc(
+          "Enable the denormal flush to zero mode when generating code."),
+      llvm::cl::init(false));
   llvm::cl::list<std::string> architectures(
       "arch", llvm::cl::desc("target architectures (e.g. sm_70 or compute_75)"),
       llvm::cl::OneOrMore, llvm::cl::CommaSeparated);
@@ -166,7 +170,7 @@
 
   auto status = tensorflow::kernel_gen::Run(
       input_file, output_file, architectures, tile_sizes, unroll_factors,
-      embed_memref_prints, print_ptx);
+      embed_memref_prints, print_ptx, enable_ftz);
   if (!status.ok()) {
     LOG(ERROR) << status;
     return 1;


@@ -53,13 +53,14 @@ class GpuKernelToBlobPass
  public:
   GpuKernelToBlobPass(mlir::StringRef blob_annotation,
                       llvm::ArrayRef<std::string> architectures,
-                      bool generate_fatbin, bool print_ptx) {
+                      bool generate_fatbin, bool print_ptx, bool enable_ftz) {
     if (!blob_annotation.empty()) {
       blob_annotation_ = blob_annotation.str();
     }
     architectures_ = architectures;
     generate_fatbin_ = generate_fatbin;
     print_ptx_ = print_ptx;
+    enable_ftz_ = enable_ftz;
   }
 
   void runOnOperation() override {
@@ -99,7 +100,9 @@ class GpuKernelToBlobPass
     llvmModule->setModuleIdentifier("acme");
 
     xla::HloModuleConfig config;
-    config.set_debug_options(xla::GetDebugOptionsFromFlags());
+    xla::DebugOptions options = xla::GetDebugOptionsFromFlags();
+    options.set_xla_gpu_ftz(enable_ftz_);
+    config.set_debug_options(options);
 
     using AmdGpuHsaco = std::vector<tensorflow::uint8>;
     std::vector<tensorflow::se::HsacoImage> images;
@@ -148,7 +151,9 @@ class GpuKernelToBlobPass
     llvmModule->setDataLayout(xla::gpu::nvptx::kDataLayout);
 
     xla::HloModuleConfig config;
-    config.set_debug_options(xla::GetDebugOptionsFromFlags());
+    xla::DebugOptions options = xla::GetDebugOptionsFromFlags();
+    options.set_xla_gpu_ftz(enable_ftz_);
+    config.set_debug_options(options);
 
     auto enable_fusion = [](llvm::TargetMachine* target) {
       target->Options.AllowFPOpFusion = llvm::FPOpFusion::FPOpFusionMode::Fast;
@@ -241,15 +246,16 @@ class GpuKernelToBlobPass
     return InternalError(
         "Can't find libdevice directory ${CUDA_DIR}/nvvm/libdevice");
   }
+  bool enable_ftz_;
 };
 
 }  // namespace
 
 std::unique_ptr<OperationPass<gpu::GPUModuleOp>> CreateGpuKernelToBlobPass(
     mlir::StringRef blob_annotation, ArrayRef<std::string> architectures,
-    bool generate_fatbin, bool print_ptx) {
-  return std::make_unique<GpuKernelToBlobPass>(blob_annotation, architectures,
-                                               generate_fatbin, print_ptx);
+    bool generate_fatbin, bool print_ptx, bool enable_ftz) {
+  return std::make_unique<GpuKernelToBlobPass>(
+      blob_annotation, architectures, generate_fatbin, print_ptx, enable_ftz);
 }
 
 }  // namespace transforms
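
The pass above threads enable_ftz into xla::DebugOptions via set_xla_gpu_ftz
before the kernel module is lowered to a device binary. As a rough sketch of
the mechanism such a flag typically relies on (an assumption about the LLVM
backend, not code from this commit): clang's -fcuda-flush-denormals-to-zero and
the NVPTX backend communicate the f32 denormal mode through the LLVM
"denormal-fp-math-f32" function attribute, which a hypothetical helper could
set like this:

// Hypothetical helper, for illustration only: tag every defined function in an
// llvm::Module with the f32 denormal mode; "preserve-sign" is what backends
// read as "flush f32 denormals to zero".
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"

void SetF32DenormalMode(llvm::Module &module, bool enable_ftz) {
  const char *mode = enable_ftz ? "preserve-sign,preserve-sign" : "ieee,ieee";
  for (llvm::Function &fn : module) {
    if (!fn.isDeclaration()) fn.addFnAttr("denormal-fp-math-f32", mode);
  }
}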


@@ -69,7 +69,7 @@ std::unique_ptr<FunctionPass> CreateParallelLoopsToSequential();
 std::unique_ptr<OperationPass<gpu::GPUModuleOp>> CreateGpuKernelToBlobPass(
     mlir::StringRef blob_annotation = {},
     ArrayRef<std::string> architectures = {}, bool generate_fatbin = true,
-    bool print_ptx = false);
+    bool print_ptx = false, bool enable_ftz = false);
 
 // Pass to propagate tensorflow runtime ABI knowledge across kernel boundaries.
 std::unique_ptr<FunctionPass> CreatePropagateTfAbiKnowledgeToKernels();


@@ -111,6 +111,7 @@ def _gen_kernel_fatbin_impl(ctx):
             "--arch=%s" % arch_flag,
             "--input=%s" % ctx.file.mlir_op.path,
             "--output=%s" % gpu_bin.path,
+            "--enable_ftz=%s" % (ctx.attr.data_type == "f32"),
         ],
         mnemonic = "compile",
     )
@@ -131,6 +132,7 @@ def _gen_kernel_fatbin_impl(ctx):
 _gen_kernel_fatbin_rule = rule(
     attrs = {
         "mlir_op": attr.label(mandatory = True, allow_single_file = True),
+        "data_type": attr.string(mandatory = True),
         "tile_size": attr.string(mandatory = True),
         "unroll_factors": attr.string(),
         "gpu_archs": attr.string_list(mandatory = True),
@@ -174,6 +176,7 @@ def gen_kernel_library(name, types, tile_size, tags = [], unroll_factors = None,
         _gen_kernel_fatbin_rule(
             name = "{name}_{type}_kernel_generator".format(name = name, type = type),
             mlir_op = "{name}_{type}.mlir".format(name = name, type = type),
+            data_type = type,
             gpu_archs = rocm_gpu_architectures() if rocm_is_configured() else cuda_gpu_architectures(),
             tile_size = tile_size,
             unroll_factors = unroll_factors,