From 1e135c54c52a71ae9267011f13fded21cc05fcc3 Mon Sep 17 00:00:00 2001
From: David Majnemer <majnemer@google.com>
Date: Mon, 22 Feb 2021 11:16:22 -0800
Subject: [PATCH] [XLA:CPU] More accurate expm1 when x is small, take two

We approximate it with:
expm1(x) = tanh(x/2)*(exp(x)+1)

Additional care is taken to handle the case when x/2 underflows but x does not
by simply approximating the result with x itself.

Yet further care must be taken to handle the case when exp(x) would not be all that close to 1, in which case we simply use:
expm1(x) = exp(x)-1

The pseudo-code for this is roughly:
  if x/2 == 0:
    return x
  exp_x = exp(x)
  if |x| > .5:
    return exp_x - 1
  return tanh(x/2)*(exp_x+1)

The actual code sequence emitted preserves vectorization in the case where different lanes observe inputs where the magnitudes are entirely different.

This suffices to get us within a relative error of 4.76e-7 or about eight ULPs when
compared against libm.

PiperOrigin-RevId: 358861023
Change-Id: I4a51ec8e2a16a95b6cbaa2af3305ce3a16201c54
---
 .../compiler/xla/service/cpu/ir_emitter.cc    | 17 +++++++++
 .../compiler/xla/service/cpu/ir_emitter.h     |  2 ++
 .../xla/service/cpu/simple_orc_jit.cc         |  3 ++
 .../xla/service/elemental_ir_emitter.cc       | 35 +++++++++----------
 .../exhaustive_unary_test_f32_or_smaller.cc   | 12 ++++++-
 5 files changed, 50 insertions(+), 19 deletions(-)

diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index 7827f1a39e4..7e9dbc34ec0 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -2551,6 +2551,23 @@ llvm::Value* IrEmitter::EmitPrintf(absl::string_view fmt,
       call_args);
 }
 
+llvm::Value* IrEmitter::EmitFprintf(absl::string_view fmt,
+                                    absl::Span<llvm::Value* const> arguments) {
+  llvm::Type* ptr_ty = b_.getInt8Ty()->getPointerTo();
+  auto stderr_symbol =
+      b_.GetInsertBlock()->getParent()->getParent()->getOrInsertGlobal("stderr",
+                                                                       ptr_ty);
+  std::vector<llvm::Value*> call_args;
+  call_args.push_back(b_.CreateLoad(stderr_symbol));
+  call_args.push_back(b_.CreateGlobalStringPtr(llvm_ir::AsStringRef(fmt)));
+  absl::c_copy(arguments, std::back_inserter(call_args));
+  return b_.CreateCall(
+      b_.GetInsertBlock()->getParent()->getParent()->getOrInsertFunction(
+          "fprintf", llvm::FunctionType::get(b_.getInt32Ty(), {ptr_ty, ptr_ty},
+                                             /*isVarArg=*/true)),
+      call_args);
+}
+
 llvm::Value* IrEmitter::EmitCallToFunc(
     std::string func_name, const std::vector<llvm::Value*>& arguments,
     llvm::Type* return_type, bool does_not_throw, bool only_accesses_arg_memory,
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
index f7762dfc8bf..3dd54507ebc 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
@@ -418,6 +418,8 @@ class IrEmitter : public DfsHloVisitorWithDefault,
   // Emits printing during the execution.
   llvm::Value* EmitPrintf(absl::string_view fmt,
                           absl::Span<llvm::Value* const> arguments);
+  llvm::Value* EmitFprintf(absl::string_view fmt,
+                           absl::Span<llvm::Value* const> arguments);
 
   // Emits a call to a non-variadic function `func_name` with arguments
   // `arguments` assuming C calling convention.
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index 5c63f90133c..5e1a3aaddf0 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -237,7 +237,10 @@ namespace {
 bool RegisterKnownJITSymbols() {
   xla::CustomCallTargetRegistry* registry =
       xla::CustomCallTargetRegistry::Global();
+  registry->Register("fprintf", reinterpret_cast<void*>(&fprintf), "Host");
   registry->Register("printf", reinterpret_cast<void*>(&printf), "Host");
+  registry->Register("stderr", reinterpret_cast<void*>(&stderr), "Host");
+  registry->Register("puts", reinterpret_cast<void*>(&puts), "Host");
 
 #define REGISTER_CPU_RUNTIME_SYMBOL(base_name)                               \
   do {                                                                       \
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index 1c417bc31f8..cab779858ab 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -1424,25 +1424,24 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitExpm1(PrimitiveType prim_type,
   auto type = llvm_ir::PrimitiveTypeToIrType(prim_type, module_);
   auto one = llvm::ConstantFP::get(type, 1.0);
   auto half = llvm::ConstantFP::get(type, 0.5);
-  // When the exponent is large, the naive evaluation of e^(x) - 1 is more
-  // accurate than the Taylor series.
-  TF_ASSIGN_OR_RETURN(auto exp_x, EmitExp(prim_type, value, ""));
-  auto for_large_x = FSub(exp_x, one);
-  // The Taylor series for exp(x) is 1 + x + x^2/2 + x^3/6 + ….
-  // We want exp(x)-1 which is x + x^2/2 + x^3/6 + ….
-  // We use the second degree approximation of exp(x)-1 = x + x^2/2.
-  auto x_squared = FMul(x, x);
-  auto x_squared_over_two = FMul(x_squared, half);
-  auto for_small_x = FAdd(x, x_squared_over_two);
-  // At this point, the relative errors due to floating point precision loss of
-  // calculating exp(x) - 1 and the polynomial exp(x)-1 = x + x^2/2 are about
-  // equal, with a value of approximately 2^-16.
-  const auto kExponentIsSmallThreshold = 0.009;
+  auto zero = llvm::ConstantFP::get(type, 0.0);
+
+  // expm1(x) == tanh(x/2)*(exp(x)+1)
+  // x/2 can underflow, if it does we approximate expm1 with x.
+  auto x_over_two = FMul(x, half);
+  auto x_over_two_is_zero = FCmpOEQ(x_over_two, zero);
   auto abs_x =
-      llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, {value}, {type}, b_);
-  auto x_is_small =
-      FCmpOLT(abs_x, llvm::ConstantFP::get(type, kExponentIsSmallThreshold));
-  return Select(x_is_small, for_small_x, for_large_x);
+      llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, {x}, {type}, b_);
+  // Use a naive exp(x)-1 calculation if |x| is > 0.5
+  auto x_magnitude_is_large = FCmpOGT(abs_x, half);
+  TF_ASSIGN_OR_RETURN(auto tanh_of_x_over_two, EmitTanh(prim_type, x_over_two));
+  TF_ASSIGN_OR_RETURN(auto exp_of_x, EmitExp(prim_type, x, ""));
+  auto exp_of_x_plus_one = FAdd(exp_of_x, one);
+  auto exp_of_x_minus_one = FSub(exp_of_x, one);
+  auto expm1_of_x = FMul(tanh_of_x_over_two, exp_of_x_plus_one);
+  expm1_of_x = Select(x_magnitude_is_large, exp_of_x_minus_one, expm1_of_x);
+  expm1_of_x = Select(x_over_two_is_zero, x, expm1_of_x);
+  return expm1_of_x;
 }
 
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitPow(PrimitiveType prim_type,
diff --git a/tensorflow/compiler/xla/tests/exhaustive_unary_test_f32_or_smaller.cc b/tensorflow/compiler/xla/tests/exhaustive_unary_test_f32_or_smaller.cc
index 77441fe8b2c..ed26d05dc66 100644
--- a/tensorflow/compiler/xla/tests/exhaustive_unary_test_f32_or_smaller.cc
+++ b/tensorflow/compiler/xla/tests/exhaustive_unary_test_f32_or_smaller.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <limits>
+
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/exhaustive_op_test_utils.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -300,7 +302,15 @@ UNARY_TEST_FLOAT_32_BITS_OR_LESS(Exp, {
 UNARY_TEST_FLOAT_32_BITS_OR_LESS(Expm1, {
   ErrorSpecGen error_spec_gen = GetDefaultSpecGenerator();
   if (ty_ == F32) {
-    error_spec_gen = +[](NativeT x) { return ErrorSpec{0, 0.00015}; };
+    if (platform_ == "Host") {
+      error_spec_gen = +[](NativeT x) {
+        // We expect no worse than an error of 8 ULPs.
+        return ErrorSpec{
+            0.0, std::scalbn(8.0f, -std::numeric_limits<float>::digits)};
+      };
+    } else {
+      error_spec_gen = +[](NativeT x) { return ErrorSpec{0, 0.00015}; };
+    }
   }
 
   // Our CPU implementation of expm1 returns one incorrect value: says