Add bfloat16 support to more cwise CPU ops

PiperOrigin-RevId: 325248465
Change-Id: I68b3be2af4f9acedb76ab6077bf5dac9ac6eeb72
Reed Wanderman-Milne, 2020-08-06 09:41:59 -07:00 (committed by TensorFlower Gardener)
parent 3f24d131d7
commit 696a4a76ce
23 changed files with 112 additions and 62 deletions
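
A minimal usage sketch (not taken from the commit) of what these registrations enable: element-wise ops such as Ceil, Cos, Exp, Floor, FloorMod, Pow, Sign, and Tanh gain CPU kernels for bfloat16, so calls like the following no longer raise a missing-kernel error on CPU.

import tensorflow as tf

x = tf.constant([0.25, 1.5, 2.75], dtype=tf.bfloat16)
with tf.device("/CPU:0"):
  print(tf.math.ceil(x))   # Ceil kernel, now registered for bfloat16
  print(tf.math.cos(x))    # Cos
  print(tf.math.tanh(x))   # Tanh (TanhGrad covers the backward pass)
  print(tf.math.pow(x, tf.constant(2.0, dtype=tf.bfloat16)))  # Pow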

View File

@ -16,7 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
REGISTER3(UnaryOp, CPU, "Ceil", functor::ceil, float, Eigen::half, double);
REGISTER4(UnaryOp, CPU, "Ceil", functor::ceil, float, Eigen::half, bfloat16,
double);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER3(UnaryOp, GPU, "Ceil", functor::ceil, float, Eigen::half, double);

View File

@ -156,6 +156,7 @@ struct TernaryClipOp<CPUDevice, T> {
INSTANTIATE_CPU(Eigen::half);
INSTANTIATE_CPU(float);
INSTANTIATE_CPU(double);
INSTANTIATE_CPU(bfloat16);
INSTANTIATE_CPU(int8);
INSTANTIATE_CPU(int16);
INSTANTIATE_CPU(int32);
@ -173,6 +174,7 @@ INSTANTIATE_CPU(uint16);
REGISTER_CPU_KERNEL(Eigen::half);
REGISTER_CPU_KERNEL(float);
REGISTER_CPU_KERNEL(double);
REGISTER_CPU_KERNEL(bfloat16);
REGISTER_CPU_KERNEL(int8);
REGISTER_CPU_KERNEL(int16);
REGISTER_CPU_KERNEL(int32);
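
A rough sketch (illustrative, not from the commit) of the clipping path these bfloat16 instantiations are meant to cover, clipping a bfloat16 tensor on CPU:

import tensorflow as tf

x = tf.constant([-2.0, 0.5, 3.0], dtype=tf.bfloat16)
lo = tf.constant(-1.0, dtype=tf.bfloat16)
hi = tf.constant(1.0, dtype=tf.bfloat16)
with tf.device("/CPU:0"):
  y = tf.clip_by_value(x, lo, hi)  # -> [-1, 0.5, 1] in bfloat16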

View File

@ -16,8 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
REGISTER5(UnaryOp, CPU, "Cos", functor::cos, float, Eigen::half, double,
complex64, complex128);
REGISTER6(UnaryOp, CPU, "Cos", functor::cos, float, Eigen::half, bfloat16,
double, complex64, complex128);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER3(UnaryOp, GPU, "Cos", functor::cos, float, Eigen::half, double);

View File

@ -16,8 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
REGISTER4(UnaryOp, CPU, "Cosh", functor::cosh, float, double, complex64,
complex128);
REGISTER5(UnaryOp, CPU, "Cosh", functor::cosh, float, double, bfloat16,
complex64, complex128);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \

View File

@ -16,8 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
REGISTER5(UnaryOp, CPU, "Exp", functor::exp, float, Eigen::half, double,
complex64, complex128);
REGISTER6(UnaryOp, CPU, "Exp", functor::exp, float, Eigen::half, bfloat16,
double, complex64, complex128);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER5(UnaryOp, GPU, "Exp", functor::exp, float, Eigen::half, double,

View File

@ -16,8 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
REGISTER5(UnaryOp, CPU, "Expm1", functor::expm1, float, Eigen::half, double,
complex64, complex128);
REGISTER6(UnaryOp, CPU, "Expm1", functor::expm1, float, Eigen::half, bfloat16,
double, complex64, complex128);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER3(UnaryOp, GPU, "Expm1", functor::expm1, float, Eigen::half, double);
#endif

View File

@ -16,7 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
REGISTER3(UnaryOp, CPU, "Floor", functor::floor, float, Eigen::half, double);
REGISTER4(UnaryOp, CPU, "Floor", functor::floor, float, Eigen::half, bfloat16,
double);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER3(UnaryOp, GPU, "Floor", functor::floor, float, Eigen::half, double);

View File

@ -18,8 +18,8 @@ limitations under the License.
namespace tensorflow {
REGISTER6(BinaryOp, CPU, "FloorDiv", functor::safe_floor_div, uint8, uint16,
int8, int16, int32, int64);
REGISTER3(BinaryOp, CPU, "FloorDiv", functor::floor_div_real, float,
Eigen::half, double);
REGISTER4(BinaryOp, CPU, "FloorDiv", functor::floor_div_real, float,
Eigen::half, bfloat16, double);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER4(BinaryOp, GPU, "FloorDiv", functor::floor_div, uint8, uint16, int16,

View File

@ -18,7 +18,8 @@ limitations under the License.
namespace tensorflow {
REGISTER3(BinaryOp, CPU, "FloorMod", functor::safe_floor_mod, int32, int64,
uint64);
REGISTER2(BinaryOp, CPU, "FloorMod", functor::floor_fmod, float, double);
REGISTER3(BinaryOp, CPU, "FloorMod", functor::floor_fmod, bfloat16, float,
double);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
// A special GPU kernel for int32.

View File

@ -16,8 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
REGISTER3(UnaryOp, CPU, "IsFinite", functor::isfinite, float, Eigen::half,
double);
REGISTER4(UnaryOp, CPU, "IsFinite", functor::isfinite, float, Eigen::half,
bfloat16, double);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER3(UnaryOp, GPU, "IsFinite", functor::isfinite, float, Eigen::half,

View File

@ -16,7 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
REGISTER3(UnaryOp, CPU, "IsInf", functor::isinf, float, Eigen::half, double);
REGISTER4(UnaryOp, CPU, "IsInf", functor::isinf, float, Eigen::half, bfloat16,
double);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER3(UnaryOp, GPU, "IsInf", functor::isinf, float, Eigen::half, double);

View File

@ -16,8 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
REGISTER5(UnaryOp, CPU, "Log1p", functor::log1p, float, Eigen::half, double,
complex64, complex128);
REGISTER6(UnaryOp, CPU, "Log1p", functor::log1p, float, Eigen::half, bfloat16,
double, complex64, complex128);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER3(UnaryOp, GPU, "Log1p", functor::log1p, float, Eigen::half, double);

View File

@ -16,8 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
REGISTER5(BinaryOp, CPU, "Pow", functor::pow, float, Eigen::half, double,
complex64, complex128);
REGISTER6(BinaryOp, CPU, "Pow", functor::pow, float, Eigen::half, bfloat16,
double, complex64, complex128);
REGISTER2(BinaryOp, CPU, "Pow", functor::safe_pow, int32, int64);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@ -30,15 +30,8 @@ REGISTER3(SimpleBinaryOp, GPU, "InvGrad", functor::inverse_grad, float,
Eigen::half, double);
#endif
#ifdef ENABLE_INTEL_MKL_BFLOAT16
// Since Eigen backend does not support bfloat16 ops, we are selectively
// enabling them for MKL backend.
REGISTER6(UnaryOp, CPU, "Reciprocal", functor::inverse, float, Eigen::half,
double, complex64, complex128, bfloat16);
#else
REGISTER5(UnaryOp, CPU, "Reciprocal", functor::inverse, float, Eigen::half,
double, complex64, complex128);
#endif // ENABLE_INTEL_MKL_BFLOAT16
bfloat16, double, complex64, complex128);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER4(UnaryOp, GPU, "Reciprocal", functor::inverse, float, Eigen::half,
double, int64);
@ -47,8 +40,8 @@ REGISTER4(UnaryOp, GPU, "Reciprocal", functor::inverse, float, Eigen::half,
REGISTER(UnaryOp, SYCL, "Reciprocal", functor::inverse, float);
#endif // TENSORFLOW_USE_SYCL
REGISTER5(SimpleBinaryOp, CPU, "ReciprocalGrad", functor::inverse_grad, float,
Eigen::half, double, complex64, complex128);
REGISTER6(SimpleBinaryOp, CPU, "ReciprocalGrad", functor::inverse_grad, float,
Eigen::half, bfloat16, double, complex64, complex128);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER3(SimpleBinaryOp, GPU, "ReciprocalGrad", functor::inverse_grad, float,
Eigen::half, double);
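
The hunk above drops the ENABLE_INTEL_MKL_BFLOAT16 guard, so bfloat16 Reciprocal and ReciprocalGrad are registered for the default CPU backend rather than only the MKL build. A minimal sketch (illustrative, assuming GradientTape handles bfloat16 like the other float types) that exercises both kernels:

import tensorflow as tf

x = tf.constant([0.5, 2.0, 4.0], dtype=tf.bfloat16)
with tf.device("/CPU:0"):
  with tf.GradientTape() as tape:
    tape.watch(x)
    y = tf.math.reciprocal(x)   # forward: Reciprocal kernel
  dy_dx = tape.gradient(y, x)   # backward: ReciprocalGrad, the gradient -1 / x**2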

View File

@ -16,8 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
REGISTER7(UnaryOp, CPU, "Sign", functor::sign, float, double, int32, int64,
complex64, Eigen::half, complex128);
REGISTER8(UnaryOp, CPU, "Sign", functor::sign, float, double, int32, int64,
complex64, Eigen::half, bfloat16, complex128);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER6(UnaryOp, GPU, "Sign", functor::sign, float, Eigen::half, double,
int64, complex64, complex128);

View File

@ -16,8 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
REGISTER5(UnaryOp, CPU, "Sin", functor::sin, float, Eigen::half, double,
complex64, complex128);
REGISTER6(UnaryOp, CPU, "Sin", functor::sin, float, Eigen::half, bfloat16,
double, complex64, complex128);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER3(UnaryOp, GPU, "Sin", functor::sin, float, Eigen::half, double);

View File

@ -16,8 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
REGISTER4(UnaryOp, CPU, "Sinh", functor::sinh, float, double, complex64,
complex128);
REGISTER5(UnaryOp, CPU, "Sinh", functor::sinh, float, double, bfloat16,
complex64, complex128);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \

View File

@ -16,8 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
REGISTER5(UnaryOp, CPU, "Tan", functor::tan, Eigen::half, float, double,
complex64, complex128);
REGISTER6(UnaryOp, CPU, "Tan", functor::tan, Eigen::half, bfloat16, float,
double, complex64, complex128);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER3(UnaryOp, GPU, "Tan", functor::tan, Eigen::half, float, double);

View File

@ -17,8 +17,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_gradients.h"
namespace tensorflow {
REGISTER5(UnaryOp, CPU, "Tanh", functor::tanh, float, Eigen::half, double,
complex64, complex128);
REGISTER6(UnaryOp, CPU, "Tanh", functor::tanh, float, Eigen::half, bfloat16,
double, complex64, complex128);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#ifndef MLIR_GENERATED_GPU_KERNELS_ENABLED
@ -30,8 +30,8 @@ REGISTER3(UnaryOp, GPU, "Tanh", functor::tanh, float, Eigen::half, double);
REGISTER2(UnaryOp, SYCL, "Tanh", functor::tanh, float, double);
#endif // TENSORFLOW_USE_SYCL
REGISTER5(SimpleBinaryOp, CPU, "TanhGrad", functor::tanh_grad, float,
Eigen::half, double, complex64, complex128);
REGISTER6(SimpleBinaryOp, CPU, "TanhGrad", functor::tanh_grad, float,
Eigen::half, bfloat16, double, complex64, complex128);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER3(SimpleBinaryOp, GPU, "TanhGrad", functor::tanh_grad, float,
Eigen::half, double);

View File

@ -67,6 +67,7 @@ class ClipTest(test.TestCase):
dtypes.float16,
dtypes.float32,
dtypes.float64,
dtypes.bfloat16,
dtypes.int16,
dtypes.int32,
dtypes.int64,
@ -88,6 +89,7 @@ class ClipTest(test.TestCase):
dtypes.float16,
dtypes.float32,
dtypes.float64,
dtypes.bfloat16,
dtypes.int16,
dtypes.int32,
dtypes.int64,
@ -110,6 +112,7 @@ class ClipTest(test.TestCase):
dtypes.float16,
dtypes.float32,
dtypes.float64,
dtypes.bfloat16,
dtypes.int16,
dtypes.int32,
dtypes.int64,
@ -132,6 +135,7 @@ class ClipTest(test.TestCase):
dtypes.float16,
dtypes.float32,
dtypes.float64,
dtypes.bfloat16,
dtypes.int16,
dtypes.int32,
dtypes.int64,

View File

@ -841,6 +841,9 @@ class MathOpsOverloadTest(test.TestCase):
def _compareBinary(self, x, y, dtype, np_func, tf_func):
np_ans = np_func(x, y).astype(dtype.as_numpy_dtype)
if dtype == dtypes_lib.bfloat16:
# assertAllClose does not properly handle bfloat16 values
np_ans = np_ans.astype(np.float32)
self.assertAllClose(np_ans,
self._computeTensorAndLiteral(x, y, dtype, tf_func))
self.assertAllClose(np_ans,
@ -857,6 +860,7 @@ class MathOpsOverloadTest(test.TestCase):
dtypes_lib.float16,
dtypes_lib.float32,
dtypes_lib.float64,
dtypes_lib.bfloat16,
dtypes_lib.int32,
dtypes_lib.int64,
dtypes_lib.complex64,
@ -920,12 +924,16 @@ class MathOpsOverloadTest(test.TestCase):
class IsFiniteInfNanTest(test.TestCase):
def _compare(self, x, use_gpu):
np_finite, np_inf, np_nan = np.isfinite(x), np.isinf(x), np.isnan(x)
with test_util.device(use_gpu=use_gpu):
inx = ops.convert_to_tensor(x)
ofinite, oinf, onan = math_ops.is_finite(inx), math_ops.is_inf(
inx), math_ops.is_nan(inx)
tf_finite, tf_inf, tf_nan = self.evaluate([ofinite, oinf, onan])
if x.dtype == dtypes_lib.bfloat16.as_numpy_dtype:
# Numpy will implicitly convert bfloat16 value to float16, so we cast to
# float32 to avoid this.
x = x.astype(np.float32)
np_finite, np_inf, np_nan = np.isfinite(x), np.isinf(x), np.isnan(x)
self.assertAllEqual(np_inf, tf_inf)
self.assertAllEqual(np_nan, tf_nan)
self.assertAllEqual(np_finite, tf_finite)
@ -934,11 +942,18 @@ class IsFiniteInfNanTest(test.TestCase):
self.assertShapeEqual(np_finite, ofinite)
def _testDtype(self, dtype):
fi = np.finfo(dtype)
data = np.array([
0, -1, 1, fi.resolution, -fi.resolution, fi.min, fi.max, -np.inf,
np.inf, np.nan
]).astype(dtype)
if dtype != dtypes_lib.bfloat16.as_numpy_dtype:
fi = np.finfo(dtype)
data = np.array([
0, -1, 1, fi.resolution, -fi.resolution, fi.min, fi.max, -np.inf,
np.inf, np.nan
]).astype(dtype)
else:
# np.finfo does not support bfloat16
data = np.array([
0, -1, 1, 0.01, -0.01, -3.3895e+38, 3.3895e+38, -np.inf, np.inf,
np.nan
]).astype(dtype)
self._compare(data, use_gpu=False)
self._compare(data, use_gpu=True)
@ -951,6 +966,9 @@ class IsFiniteInfNanTest(test.TestCase):
def testDouble(self):
self._testDtype(np.float64)
def testBfloat16(self):
self._testDtype(dtypes_lib.bfloat16.as_numpy_dtype)
def testSqrt(self):
for dtype in [np.float16, np.float32, np.float64]:
fi = np.finfo(dtype)
@ -998,8 +1016,8 @@ class RoundingTest(test.TestCase):
def _testDtype(self, dtype):
data = (np.arange(-3, 3) / 4.).reshape(1, 3, 2).astype(dtype)
self._compare(data)
# TODO: rint op is not supported for float16
if dtype is np.float16:
# TODO(reedwm): rint op is not supported for float16 and bfloat16
if dtype in (np.float16, dtypes_lib.bfloat16.as_numpy_dtype):
return
self._compare_values(data)
x = [0.5, 0.5000001]
@ -1012,10 +1030,12 @@ class RoundingTest(test.TestCase):
self._compare_values(x, y=y)
def testTypes(self):
self.skipTest("b/131162241")
for dtype in [np.float16, np.float32, np.float64]:
with self.subTest(dtype=dtype):
self._testDtype(dtype)
# TODO(b/131162241): Enable test for GPU
with ops.device("/CPU:0"):
for dtype in [np.float16, np.float32, np.float64,
dtypes_lib.bfloat16.as_numpy_dtype]:
with self.subTest(dtype=dtype):
self._testDtype(dtype)
class ComplexMakeRealImagTest(test.TestCase):
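
The test changes above work around two quirks of the NumPy bfloat16 extension type: assertAllClose and np.finfo do not handle it, and some NumPy reference ops go through float16, whose range (max ~6.55e4) cannot hold bfloat16's largest finite values (~3.39e38). A small sketch of the float32-reference pattern used here (variable names are illustrative):

import numpy as np
import tensorflow as tf

bf16 = tf.bfloat16.as_numpy_dtype
data = np.array([0.0, 1.0, 3.3895e38, -3.3895e38, np.inf, np.nan], dtype=bf16)

# Compute the NumPy reference in float32 so large finite bfloat16 values are
# not squashed to inf by an intermediate float16 conversion.
ref_finite = np.isfinite(data.astype(np.float32))
ref_inf = np.isinf(data.astype(np.float32))

tf_finite = tf.math.is_finite(tf.constant(data)).numpy()
tf_inf = tf.math.is_inf(tf.constant(data)).numpy()
np.testing.assert_array_equal(ref_finite, tf_finite)
np.testing.assert_array_equal(ref_inf, tf_inf)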

View File

@ -61,6 +61,8 @@ def _default_tolerance(dtype):
Args:
dtype: A datatype.
"""
if dtype == dtypes_lib.bfloat16.as_numpy_dtype:
return 5e-3
if dtype == np.float16:
return 5e-3
elif dtype in (np.float32, np.complex64):
@ -81,12 +83,7 @@ class UnaryOpTest(test.TestCase):
np_ans = np_func(x)
with self.cached_session(use_gpu=False):
inx = ops.convert_to_tensor(x)
if x.dtype in (np.float32, np.float64,
dtypes_lib.bfloat16.as_numpy_dtype):
y = 1.1 * tf_func(inx)
np_ans *= 1.1
else:
y = tf_func(inx)
y = tf_func(inx)
tf_cpu = self.evaluate(y)
self.assertShapeEqual(np_ans, y)
if x.dtype == np.float16:
@ -99,7 +96,7 @@ class UnaryOpTest(test.TestCase):
if x.dtype in (np.complex64, np.complex128) and tf_func == math_ops.sign:
return # Return early
if x.dtype == np.float16:
if x.dtype in (np.float16, dtypes_lib.bfloat16.as_numpy_dtype):
s = list(np.shape(x))
jacob_t, _ = gradient_checker.compute_gradient(
inx, s, y, s, x_init_value=x)
@ -108,7 +105,7 @@ class UnaryOpTest(test.TestCase):
yf = tf_func(inxf)
_, jacob_n = gradient_checker.compute_gradient(
inxf, s, yf, s, x_init_value=xf, delta=1e-2)
jacob_n = jacob_n.astype(np.float16)
jacob_n = jacob_n.astype(x.dtype)
self.assertAllClose(jacob_t, jacob_n, rtol=grad_rtol, atol=grad_atol)
elif x.dtype in (np.float32, np.complex64):
s = list(np.shape(x))
@ -384,13 +381,36 @@ class UnaryOpTest(test.TestCase):
self._compareBothSparse(y, np.sign, math_ops.sign)
self._compareBothSparse(x, np.vectorize(math.erf), math_ops.erf, tol=1e-3)
@test_util.run_deprecated_v1
def testBFloat16Basic(self):
def compute_f32(np_func):
"""Decorator to compute Numpy function with float32 math."""
def f(x):
y = np_func(x.astype(np.float32))
return y.astype(x.dtype)
return f
bfloat16 = dtypes_lib.bfloat16.as_numpy_dtype
x = np.arange(-6, 6,
2).reshape(1, 3, 2).astype(dtypes_lib.bfloat16.as_numpy_dtype)
y = (x + .5).astype(bfloat16) # no zero
z = (x + 15.5).astype(bfloat16) # all positive
self._compareCpu(x, np.abs, math_ops.abs)
self._compareCpu(x, np.abs, _ABS)
self._compareBoth(x, np.negative, math_ops.negative)
self._compareBoth(x, np.negative, _NEG)
self._compareCpu(y, compute_f32(self._inv), math_ops.reciprocal)
self._compareCpu(x, np.exp, math_ops.exp)
self._compareCpu(x, np.expm1, math_ops.expm1)
self._compareCpu(z, compute_f32(np.log), math_ops.log)
self._compareCpu(z, compute_f32(np.log1p), math_ops.log1p)
self._compareCpu(y, np.sign, math_ops.sign)
self._compareBoth(x, compute_f32(np.sin), math_ops.sin)
self._compareBoth(x, compute_f32(np.cos), math_ops.cos)
self._compareBoth(x, compute_f32(np.tan), math_ops.tan)
self._compareBoth(x, compute_f32(np.sinh), math_ops.sinh)
self._compareBoth(x, compute_f32(np.cosh), math_ops.cosh)
self._compareBoth(x, compute_f32(np.tanh), math_ops.tanh)
def testInt8Basic(self):
x = np.arange(-6, 6, 2).reshape(1, 3, 2).astype(np.int8)
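
The compute_f32 helper and the 5e-3 tolerance above reflect bfloat16's precision: with an 8-bit significand, a single rounding step can introduce a relative error of up to 2**-8 (about 3.9e-3), so the NumPy reference is computed in float32 and the comparison uses a loose tolerance. A standalone sketch of that pattern (the helper mirrors the one in the diff; the rest is illustrative):

import numpy as np
import tensorflow as tf

bf16 = tf.bfloat16.as_numpy_dtype

def compute_f32(np_func):
  """Computes np_func in float32, then casts the result back to the input dtype."""
  def f(x):
    return np_func(x.astype(np.float32)).astype(x.dtype)
  return f

x = np.linspace(-3.0, 3.0, 7).astype(bf16)
expected = compute_f32(np.tanh)(x)             # float32 reference, rounded to bfloat16
actual = tf.math.tanh(tf.constant(x)).numpy()  # CPU bfloat16 Tanh kernel
np.testing.assert_allclose(actual.astype(np.float32),
                           expected.astype(np.float32),
                           rtol=5e-3, atol=5e-3)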

View File

@ -476,6 +476,13 @@ class DivAndModTest(test_util.TensorFlowTestCase):
# % array_ops.constant(divs))
# self.assertAllEqual(tf2_result, tf_result)
def testFloorModBfloat16(self):
nums, divs = self.floatTestData()
tf_result = math_ops.floormod(math_ops.cast(nums, dtypes.bfloat16),
math_ops.cast(divs, dtypes.bfloat16))
np_result = nums % divs
self.assertAllEqual(tf_result, np_result)
def testTruncateModInt(self):
nums, divs = self.intTestData()
tf_result = math_ops.truncatemod(nums, divs)