[XLA:CPU] Remove the global/module-level fast math flags
These are deprecated in favor of instruction-level fast math flags, and most of LLVM's backend code has been updated to use those instead. Dropping the global flags gives us more fine-grained control over fast math without any loss of performance.

Disabling UnsafeFPMath has the side effect of requiring __truncdfhf2 for double->half conversions, so provide that symbol. Also always allow FMA formation: while it is not IEEE 754 compliant, it never decreases accuracy.

PiperOrigin-RevId: 281801638
Change-Id: I2d96220fefebad4d11b1dab8f75b06ccb88a05bf
commit d04bfee679
parent 1703690e1e
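For context, a minimal sketch of the instruction-level mechanism the commit message refers to (not code from this change): LLVM's IRBuilder carries per-instruction fast math flags, so each emitted FP operation can opt in individually instead of relying on the global TargetOptions toggles. EmitFastAdd is a made-up helper name.

  // Sketch only, assuming LLVM's C++ API.
  #include "llvm/IR/IRBuilder.h"

  llvm::Value* EmitFastAdd(llvm::IRBuilder<>& b, llvm::Value* x, llvm::Value* y) {
    llvm::FastMathFlags fmf;
    fmf.setNoNaNs();         // per-instruction analogue of NoNaNsFPMath
    fmf.setNoInfs();         // per-instruction analogue of NoInfsFPMath
    fmf.setNoSignedZeros();  // per-instruction analogue of NoSignedZerosFPMath
    b.setFastMathFlags(fmf); // applies to FP instructions this builder creates
    return b.CreateFAdd(x, y, "fast_add");
  }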
@@ -409,20 +409,8 @@ auto memory_alignment = [](LogicalBuffer::Color) { return kMemoryAlignment; };
 llvm::TargetOptions CompilerTargetOptions(
     const HloModuleConfig& module_config) {
   llvm::TargetOptions target_options;
-  // In LLVM backend flags, UnsafeFPMath does not explicitly imply NoInfs, etc.
-  if (module_config.debug_options().xla_cpu_enable_fast_math()) {
-    target_options.UnsafeFPMath = true;
-    target_options.NoInfsFPMath =
-        !module_config.debug_options().xla_cpu_fast_math_honor_infs();
-    target_options.NoNaNsFPMath =
-        !module_config.debug_options().xla_cpu_fast_math_honor_nans();
-    target_options.NoSignedZerosFPMath = true;
-  } else {
-    target_options.UnsafeFPMath = false;
-    target_options.NoInfsFPMath = false;
-    target_options.NoNaNsFPMath = false;
-    target_options.NoSignedZerosFPMath = false;
-  }
+  // Always allow FMA fusion. This increases precision instead of decreasing it.
+  target_options.AllowFPOpFusion = llvm::FPOpFusion::Fast;
   return target_options;
 }
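A quick illustration (not part of the change) of why FMA formation never decreases accuracy: std::fma computes a*b + c with a single rounding, whereas the unfused expression rounds the product first. The constants below are chosen so the difference is visible in double precision; compile without FP contraction so the first expression stays unfused.

  #include <cmath>
  #include <cstdio>

  int main() {
    double a = 1.0 + 0x1p-27;          // exactly representable in double
    double b = 1.0 - 0x1p-27;
    double c = -1.0;
    double separate = a * b + c;       // a*b rounds to 1.0, so this is 0.0
    double fused = std::fma(a, b, c);  // single rounding keeps -0x1p-54
    std::printf("separate=%a fused=%a\n", separate, fused);
    return 0;
  }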
@@ -131,3 +131,9 @@ float TF_ATTRIBUTE_WEAK __gnu_h2f_ieee(uint16 h) {
   o.set_uint(o.as_uint() | (h & 0x8000) << 16);  // sign bit
   return o.as_float();
 }
+
+uint16 TF_ATTRIBUTE_WEAK __truncdfhf2(double d) {
+  // This does a double rounding step, but it's precise enough for our use
+  // cases.
+  return __gnu_f2h_ieee(static_cast<float>(d));
+}
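A hypothetical spot check of the new helper, not part of the commit: half-precision 1.0 has the bit pattern 0x3C00, so any correct double->half truncation must map 1.0 to it. The std::uint16_t declaration stands in for tensorflow::uint16.

  #include <cassert>
  #include <cstdint>

  extern "C" std::uint16_t __truncdfhf2(double);  // provided by runtime_fp16

  int main() {
    assert(__truncdfhf2(1.0) == 0x3C00);   // half 1.0
    assert(__truncdfhf2(-2.0) == 0xC000);  // half -2.0
    assert(__truncdfhf2(0.0) == 0x0000);   // half +0.0
    return 0;
  }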
@@ -24,4 +24,7 @@ extern "C" tensorflow::uint16 __gnu_f2h_ieee(float);
 // Converts an F16 value to a F32.
 extern "C" float __gnu_h2f_ieee(tensorflow::uint16);
 
+// Converts an F64 value to a F16.
+extern "C" tensorflow::uint16 __truncdfhf2(double);
+
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_FP16_H_
@@ -250,6 +250,8 @@ bool RegisterKnownJITSymbols() {
                      "Host");
   registry->Register("__gnu_h2f_ieee", reinterpret_cast<void*>(__gnu_h2f_ieee),
                      "Host");
+  registry->Register("__truncdfhf2", reinterpret_cast<void*>(__truncdfhf2),
+                     "Host");
 
 #undef REGISTER_CPU_RUNTIME_SYMBOL
 
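The registration above is what lets JIT-compiled code resolve __truncdfhf2 at runtime: on targets without native f16 support, LLVM lowers "fptrunc double ... to half" to a call to that libgcc-style symbol, which the JIT then looks up by name. A minimal sketch of the pattern, using a hypothetical map in place of XLA's real registry type:

  #include <cstdint>
  #include <map>
  #include <string>

  extern "C" std::uint16_t __truncdfhf2(double);

  // Hypothetical stand-in for XLA's symbol registry; the real type differs.
  std::map<std::string, void*>& SymbolRegistry() {
    static std::map<std::string, void*> registry;
    return registry;
  }

  void RegisterFp16Symbols() {
    // Unresolved symbols in generated code are resolved through this table.
    SymbolRegistry()["__truncdfhf2"] = reinterpret_cast<void*>(__truncdfhf2);
  }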
@@ -607,20 +607,6 @@ llvm::Function* CreateCpuFunction(llvm::FunctionType* function_type,
   // created by the JIT compiled code.
   function->setHasUWTable();
 
-  if (module_config.debug_options().xla_cpu_enable_fast_math()) {
-    function->addFnAttr("unsafe-fp-math", "true");
-    function->addFnAttr("no-signed-zeros-fp-math", "true");
-    if (!module_config.debug_options().xla_cpu_fast_math_honor_nans()) {
-      function->addFnAttr("no-nans-fp-math", "true");
-    }
-    if (!module_config.debug_options().xla_cpu_fast_math_honor_infs()) {
-      function->addFnAttr("no-infs-fp-math", "true");
-    }
-    if (module_config.debug_options().xla_cpu_fast_math_honor_division()) {
-      function->addFnAttr("reciprocal-estimates", "none");
-    }
-  }
-
   // Add the optize attribute to the function if optimizing for size. This
   // controls internal behavior of some optimization passes (e.g. loop
   // unrolling).