[TF:XLA] Implement FloorMod with fewer calls to xla::Rem

This implementation is faster as xla::Rem is rather expensive.
It is also more numerically sound as it has fewer rounding steps.

For example, given the single precision values:
  x = -1.46146206e-09
  y = 0.562811792

  old FloorMod(x, y) would compute 0
  new FloorMod(x, y) computes 0.56281179

This agrees with numpy.mod, which returns the element-wise remainder of the quotient floor_divide(x, y).

PiperOrigin-RevId: 239061236
2019-03-18 14:27:07 -07:00 · 2019-03-18 14:27:07 -07:00 · 955bc76f9e
commit 955bc76f9e
parent e1e6ec9c2c
1 changed files with 6 additions and 3 deletions
--- a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
@@ -148,14 +148,17 @@ XLA_MAKE_BINARY(Xdivy, XdivyImpl(lhs, rhs, broadcast_helper));

 // Implementation of FloorMod. Pseudo-code:
 // T trunc_mod = std::fmod(x, y);
-// return (x < T(0)) == (y < T(0)) ? trunc_mod : std::fmod(trunc_mod + y, y);
+// return trunc_mod != 0 && (y < 0 != trunc_mod < 0) ? trunc_mod + y
+//                                                   : trunc_mod;
 static xla::XlaOp FloorModImpl(xla::XlaBuilder* b, DataType dtype, xla::XlaOp x,
                               xla::XlaOp y, const BCast& broadcast_helper) {
  std::tie(x, y) = XlaBinaryOp::Broadcast(x, y, broadcast_helper);
  auto zero = XlaHelpers::Zero(b, dtype);
-  auto same_sign = xla::Eq(xla::Lt(x, zero), xla::Lt(y, zero));
  auto trunc_mod = xla::Rem(x, y);
-  return xla::Select(same_sign, trunc_mod, xla::Rem(xla::Add(trunc_mod, y), y));
+  auto trunc_mod_not_zero = xla::Ne(trunc_mod, zero);
+  auto do_plus = xla::And(xla::Ne(xla::Lt(trunc_mod, zero), xla::Lt(y, zero)),
+                          trunc_mod_not_zero);
+  return xla::Select(do_plus, xla::Add(trunc_mod, y), trunc_mod);
 }
 XLA_MAKE_BINARY(FloorMod,
                FloorModImpl(b, input_type(0), lhs, rhs, broadcast_helper));