diff --git a/tensorflow/core/kernels/cwise_op_ceil.cc b/tensorflow/core/kernels/cwise_op_ceil.cc
index 4b1847d758c..f8907ff1baa 100644
--- a/tensorflow/core/kernels/cwise_op_ceil.cc
+++ b/tensorflow/core/kernels/cwise_op_ceil.cc
@@ -16,7 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER3(UnaryOp, CPU, "Ceil", functor::ceil, float, Eigen::half, double);
+REGISTER4(UnaryOp, CPU, "Ceil", functor::ceil, float, Eigen::half, bfloat16,
+          double);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 REGISTER3(UnaryOp, GPU, "Ceil", functor::ceil, float, Eigen::half, double);
diff --git a/tensorflow/core/kernels/cwise_op_clip.cc b/tensorflow/core/kernels/cwise_op_clip.cc
index c0c71c5f638..3d43cf147b1 100644
--- a/tensorflow/core/kernels/cwise_op_clip.cc
+++ b/tensorflow/core/kernels/cwise_op_clip.cc
@@ -156,6 +156,7 @@ struct TernaryClipOp<CPUDevice, T> {
 INSTANTIATE_CPU(Eigen::half);
 INSTANTIATE_CPU(float);
 INSTANTIATE_CPU(double);
+INSTANTIATE_CPU(bfloat16);
 INSTANTIATE_CPU(int8);
 INSTANTIATE_CPU(int16);
 INSTANTIATE_CPU(int32);
@@ -173,6 +174,7 @@ INSTANTIATE_CPU(uint16);
 REGISTER_CPU_KERNEL(Eigen::half);
 REGISTER_CPU_KERNEL(float);
 REGISTER_CPU_KERNEL(double);
+REGISTER_CPU_KERNEL(bfloat16);
 REGISTER_CPU_KERNEL(int8);
 REGISTER_CPU_KERNEL(int16);
 REGISTER_CPU_KERNEL(int32);
diff --git a/tensorflow/core/kernels/cwise_op_cos.cc b/tensorflow/core/kernels/cwise_op_cos.cc
index 7b434ce4294..3d406fe040a 100644
--- a/tensorflow/core/kernels/cwise_op_cos.cc
+++ b/tensorflow/core/kernels/cwise_op_cos.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER5(UnaryOp, CPU, "Cos", functor::cos, float, Eigen::half, double,
-          complex64, complex128);
+REGISTER6(UnaryOp, CPU, "Cos", functor::cos, float, Eigen::half, bfloat16,
+          double, complex64, complex128);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 REGISTER3(UnaryOp, GPU, "Cos", functor::cos, float, Eigen::half, double);
diff --git a/tensorflow/core/kernels/cwise_op_cosh.cc b/tensorflow/core/kernels/cwise_op_cosh.cc
index 3388df0096a..e6dff0ea317 100644
--- a/tensorflow/core/kernels/cwise_op_cosh.cc
+++ b/tensorflow/core/kernels/cwise_op_cosh.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Cosh", functor::cosh, float, double, complex64,
-          complex128);
+REGISTER5(UnaryOp, CPU, "Cosh", functor::cosh, float, double, bfloat16,
+          complex64, complex128);
 
 #if TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(TYPE)                                \
diff --git a/tensorflow/core/kernels/cwise_op_exp.cc b/tensorflow/core/kernels/cwise_op_exp.cc
index 2b157f0e7a9..d937dd0c06d 100644
--- a/tensorflow/core/kernels/cwise_op_exp.cc
+++ b/tensorflow/core/kernels/cwise_op_exp.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER5(UnaryOp, CPU, "Exp", functor::exp, float, Eigen::half, double,
-          complex64, complex128);
+REGISTER6(UnaryOp, CPU, "Exp", functor::exp, float, Eigen::half, bfloat16,
+          double, complex64, complex128);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 REGISTER5(UnaryOp, GPU, "Exp", functor::exp, float, Eigen::half, double,
diff --git a/tensorflow/core/kernels/cwise_op_expm1.cc b/tensorflow/core/kernels/cwise_op_expm1.cc
index 55fdc4763d3..0b145d83e5c 100644
--- a/tensorflow/core/kernels/cwise_op_expm1.cc
+++ b/tensorflow/core/kernels/cwise_op_expm1.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER5(UnaryOp, CPU, "Expm1", functor::expm1, float, Eigen::half, double,
-          complex64, complex128);
+REGISTER6(UnaryOp, CPU, "Expm1", functor::expm1, float, Eigen::half, bfloat16,
+          double, complex64, complex128);
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 REGISTER3(UnaryOp, GPU, "Expm1", functor::expm1, float, Eigen::half, double);
 #endif
diff --git a/tensorflow/core/kernels/cwise_op_floor.cc b/tensorflow/core/kernels/cwise_op_floor.cc
index 25210a0fa51..1dbd9bf0634 100644
--- a/tensorflow/core/kernels/cwise_op_floor.cc
+++ b/tensorflow/core/kernels/cwise_op_floor.cc
@@ -16,7 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER3(UnaryOp, CPU, "Floor", functor::floor, float, Eigen::half, double);
+REGISTER4(UnaryOp, CPU, "Floor", functor::floor, float, Eigen::half, bfloat16,
+          double);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 REGISTER3(UnaryOp, GPU, "Floor", functor::floor, float, Eigen::half, double);
diff --git a/tensorflow/core/kernels/cwise_op_floor_div.cc b/tensorflow/core/kernels/cwise_op_floor_div.cc
index 11869e43eaa..d1f6d4c0652 100644
--- a/tensorflow/core/kernels/cwise_op_floor_div.cc
+++ b/tensorflow/core/kernels/cwise_op_floor_div.cc
@@ -18,8 +18,8 @@ limitations under the License.
 namespace tensorflow {
 REGISTER6(BinaryOp, CPU, "FloorDiv", functor::safe_floor_div, uint8, uint16,
           int8, int16, int32, int64);
-REGISTER3(BinaryOp, CPU, "FloorDiv", functor::floor_div_real, float,
-          Eigen::half, double);
+REGISTER4(BinaryOp, CPU, "FloorDiv", functor::floor_div_real, float,
+          Eigen::half, bfloat16, double);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 REGISTER4(BinaryOp, GPU, "FloorDiv", functor::floor_div, uint8, uint16, int16,
diff --git a/tensorflow/core/kernels/cwise_op_floor_mod.cc b/tensorflow/core/kernels/cwise_op_floor_mod.cc
index 3305f54bcca..599ed1a9318 100644
--- a/tensorflow/core/kernels/cwise_op_floor_mod.cc
+++ b/tensorflow/core/kernels/cwise_op_floor_mod.cc
@@ -18,7 +18,8 @@ limitations under the License.
 namespace tensorflow {
 REGISTER3(BinaryOp, CPU, "FloorMod", functor::safe_floor_mod, int32, int64,
           uint64);
-REGISTER2(BinaryOp, CPU, "FloorMod", functor::floor_fmod, float, double);
+REGISTER3(BinaryOp, CPU, "FloorMod", functor::floor_fmod, bfloat16, float,
+          double);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 // A special GPU kernel for int32.
diff --git a/tensorflow/core/kernels/cwise_op_isfinite.cc b/tensorflow/core/kernels/cwise_op_isfinite.cc
index 061dc8367e2..42c7cbd4fd7 100644
--- a/tensorflow/core/kernels/cwise_op_isfinite.cc
+++ b/tensorflow/core/kernels/cwise_op_isfinite.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER3(UnaryOp, CPU, "IsFinite", functor::isfinite, float, Eigen::half,
-          double);
+REGISTER4(UnaryOp, CPU, "IsFinite", functor::isfinite, float, Eigen::half,
+          bfloat16, double);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 REGISTER3(UnaryOp, GPU, "IsFinite", functor::isfinite, float, Eigen::half,
diff --git a/tensorflow/core/kernels/cwise_op_isinf.cc b/tensorflow/core/kernels/cwise_op_isinf.cc
index f87a24d2085..68141f4924a 100644
--- a/tensorflow/core/kernels/cwise_op_isinf.cc
+++ b/tensorflow/core/kernels/cwise_op_isinf.cc
@@ -16,7 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER3(UnaryOp, CPU, "IsInf", functor::isinf, float, Eigen::half, double);
+REGISTER4(UnaryOp, CPU, "IsInf", functor::isinf, float, Eigen::half, bfloat16,
+          double);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 REGISTER3(UnaryOp, GPU, "IsInf", functor::isinf, float, Eigen::half, double);
diff --git a/tensorflow/core/kernels/cwise_op_log1p.cc b/tensorflow/core/kernels/cwise_op_log1p.cc
index 06fc764fc75..88ddfd6af26 100644
--- a/tensorflow/core/kernels/cwise_op_log1p.cc
+++ b/tensorflow/core/kernels/cwise_op_log1p.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER5(UnaryOp, CPU, "Log1p", functor::log1p, float, Eigen::half, double,
-          complex64, complex128);
+REGISTER6(UnaryOp, CPU, "Log1p", functor::log1p, float, Eigen::half, bfloat16,
+          double, complex64, complex128);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 REGISTER3(UnaryOp, GPU, "Log1p", functor::log1p, float, Eigen::half, double);
diff --git a/tensorflow/core/kernels/cwise_op_pow.cc b/tensorflow/core/kernels/cwise_op_pow.cc
index 1b1d626aa57..214d083e11b 100644
--- a/tensorflow/core/kernels/cwise_op_pow.cc
+++ b/tensorflow/core/kernels/cwise_op_pow.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER5(BinaryOp, CPU, "Pow", functor::pow, float, Eigen::half, double,
-          complex64, complex128);
+REGISTER6(BinaryOp, CPU, "Pow", functor::pow, float, Eigen::half, bfloat16,
+          double, complex64, complex128);
 REGISTER2(BinaryOp, CPU, "Pow", functor::safe_pow, int32, int64);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/tensorflow/core/kernels/cwise_op_reciprocal.cc b/tensorflow/core/kernels/cwise_op_reciprocal.cc
index 8e92691474a..4fe201e9c7b 100644
--- a/tensorflow/core/kernels/cwise_op_reciprocal.cc
+++ b/tensorflow/core/kernels/cwise_op_reciprocal.cc
@@ -30,15 +30,8 @@ REGISTER3(SimpleBinaryOp, GPU, "InvGrad", functor::inverse_grad, float,
           Eigen::half, double);
 #endif
 
-#ifdef ENABLE_INTEL_MKL_BFLOAT16
-// Since Eigen backend does not support bfloat16 ops, we are selectively
-// enabling them for MKL backend.
 REGISTER6(UnaryOp, CPU, "Reciprocal", functor::inverse, float, Eigen::half,
-          double, complex64, complex128, bfloat16);
-#else
-REGISTER5(UnaryOp, CPU, "Reciprocal", functor::inverse, float, Eigen::half,
-          double, complex64, complex128);
-#endif  // ENABLE_INTEL_MKL_BFLOAT16
+          bfloat16, double, complex64, complex128);
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 REGISTER4(UnaryOp, GPU, "Reciprocal", functor::inverse, float, Eigen::half,
           double, int64);
@@ -47,8 +40,8 @@ REGISTER4(UnaryOp, GPU, "Reciprocal", functor::inverse, float, Eigen::half,
 REGISTER(UnaryOp, SYCL, "Reciprocal", functor::inverse, float);
 #endif  // TENSORFLOW_USE_SYCL
 
-REGISTER5(SimpleBinaryOp, CPU, "ReciprocalGrad", functor::inverse_grad, float,
-          Eigen::half, double, complex64, complex128);
+REGISTER6(SimpleBinaryOp, CPU, "ReciprocalGrad", functor::inverse_grad, float,
+          Eigen::half, bfloat16, double, complex64, complex128);
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 REGISTER3(SimpleBinaryOp, GPU, "ReciprocalGrad", functor::inverse_grad, float,
           Eigen::half, double);
diff --git a/tensorflow/core/kernels/cwise_op_sign.cc b/tensorflow/core/kernels/cwise_op_sign.cc
index 983cee4c944..200a56eb2d2 100644
--- a/tensorflow/core/kernels/cwise_op_sign.cc
+++ b/tensorflow/core/kernels/cwise_op_sign.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER7(UnaryOp, CPU, "Sign", functor::sign, float, double, int32, int64,
-          complex64, Eigen::half, complex128);
+REGISTER8(UnaryOp, CPU, "Sign", functor::sign, float, double, int32, int64,
+          complex64, Eigen::half, bfloat16, complex128);
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 REGISTER6(UnaryOp, GPU, "Sign", functor::sign, float, Eigen::half, double,
           int64, complex64, complex128);
diff --git a/tensorflow/core/kernels/cwise_op_sin.cc b/tensorflow/core/kernels/cwise_op_sin.cc
index ab6fb1ccd5e..f0fc2af7366 100644
--- a/tensorflow/core/kernels/cwise_op_sin.cc
+++ b/tensorflow/core/kernels/cwise_op_sin.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER5(UnaryOp, CPU, "Sin", functor::sin, float, Eigen::half, double,
-          complex64, complex128);
+REGISTER6(UnaryOp, CPU, "Sin", functor::sin, float, Eigen::half, bfloat16,
+          double, complex64, complex128);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 REGISTER3(UnaryOp, GPU, "Sin", functor::sin, float, Eigen::half, double);
diff --git a/tensorflow/core/kernels/cwise_op_sinh.cc b/tensorflow/core/kernels/cwise_op_sinh.cc
index 114a6142bdc..4448d2fef76 100644
--- a/tensorflow/core/kernels/cwise_op_sinh.cc
+++ b/tensorflow/core/kernels/cwise_op_sinh.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Sinh", functor::sinh, float, double, complex64,
-          complex128);
+REGISTER5(UnaryOp, CPU, "Sinh", functor::sinh, float, double, bfloat16,
+          complex64, complex128);
 
 #if TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(TYPE)                                \
diff --git a/tensorflow/core/kernels/cwise_op_tan.cc b/tensorflow/core/kernels/cwise_op_tan.cc
index d9793501a09..115531213ac 100644
--- a/tensorflow/core/kernels/cwise_op_tan.cc
+++ b/tensorflow/core/kernels/cwise_op_tan.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER5(UnaryOp, CPU, "Tan", functor::tan, Eigen::half, float, double,
-          complex64, complex128);
+REGISTER6(UnaryOp, CPU, "Tan", functor::tan, Eigen::half, bfloat16, float,
+          double, complex64, complex128);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 REGISTER3(UnaryOp, GPU, "Tan", functor::tan, Eigen::half, float, double);
diff --git a/tensorflow/core/kernels/cwise_op_tanh.cc b/tensorflow/core/kernels/cwise_op_tanh.cc
index 1b6da56e537..de56a5e3e03 100644
--- a/tensorflow/core/kernels/cwise_op_tanh.cc
+++ b/tensorflow/core/kernels/cwise_op_tanh.cc
@@ -17,8 +17,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_gradients.h"
 
 namespace tensorflow {
-REGISTER5(UnaryOp, CPU, "Tanh", functor::tanh, float, Eigen::half, double,
-          complex64, complex128);
+REGISTER6(UnaryOp, CPU, "Tanh", functor::tanh, float, Eigen::half, bfloat16,
+          double, complex64, complex128);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #ifndef MLIR_GENERATED_GPU_KERNELS_ENABLED
@@ -30,8 +30,8 @@ REGISTER3(UnaryOp, GPU, "Tanh", functor::tanh, float, Eigen::half, double);
 REGISTER2(UnaryOp, SYCL, "Tanh", functor::tanh, float, double);
 #endif  // TENSORFLOW_USE_SYCL
 
-REGISTER5(SimpleBinaryOp, CPU, "TanhGrad", functor::tanh_grad, float,
-          Eigen::half, double, complex64, complex128);
+REGISTER6(SimpleBinaryOp, CPU, "TanhGrad", functor::tanh_grad, float,
+          Eigen::half, bfloat16, double, complex64, complex128);
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 REGISTER3(SimpleBinaryOp, GPU, "TanhGrad", functor::tanh_grad, float,
           Eigen::half, double);
diff --git a/tensorflow/python/kernel_tests/clip_ops_test.py b/tensorflow/python/kernel_tests/clip_ops_test.py
index 8d6b475c914..d0c805f96e3 100644
--- a/tensorflow/python/kernel_tests/clip_ops_test.py
+++ b/tensorflow/python/kernel_tests/clip_ops_test.py
@@ -67,6 +67,7 @@ class ClipTest(test.TestCase):
         dtypes.float16,
         dtypes.float32,
         dtypes.float64,
+        dtypes.bfloat16,
         dtypes.int16,
         dtypes.int32,
         dtypes.int64,
@@ -88,6 +89,7 @@ class ClipTest(test.TestCase):
         dtypes.float16,
         dtypes.float32,
         dtypes.float64,
+        dtypes.bfloat16,
         dtypes.int16,
         dtypes.int32,
         dtypes.int64,
@@ -110,6 +112,7 @@ class ClipTest(test.TestCase):
         dtypes.float16,
         dtypes.float32,
         dtypes.float64,
+        dtypes.bfloat16,
         dtypes.int16,
         dtypes.int32,
         dtypes.int64,
@@ -132,6 +135,7 @@ class ClipTest(test.TestCase):
         dtypes.float16,
         dtypes.float32,
         dtypes.float64,
+        dtypes.bfloat16,
         dtypes.int16,
         dtypes.int32,
         dtypes.int64,
diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py
index 78d3af17990..8d628d448db 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_test.py
@@ -841,6 +841,9 @@ class MathOpsOverloadTest(test.TestCase):
 
   def _compareBinary(self, x, y, dtype, np_func, tf_func):
     np_ans = np_func(x, y).astype(dtype.as_numpy_dtype)
+    if dtype == dtypes_lib.bfloat16:
+      # assertAllClose mishandles bfloat16, so cast the expectation to float32.
+      np_ans = np_ans.astype(np.float32)
     self.assertAllClose(np_ans,
                         self._computeTensorAndLiteral(x, y, dtype, tf_func))
     self.assertAllClose(np_ans,
@@ -857,6 +860,7 @@ class MathOpsOverloadTest(test.TestCase):
         dtypes_lib.float16,
         dtypes_lib.float32,
         dtypes_lib.float64,
+        dtypes_lib.bfloat16,
         dtypes_lib.int32,
         dtypes_lib.int64,
         dtypes_lib.complex64,
@@ -920,12 +924,16 @@ class MathOpsOverloadTest(test.TestCase):
 class IsFiniteInfNanTest(test.TestCase):
 
   def _compare(self, x, use_gpu):
-    np_finite, np_inf, np_nan = np.isfinite(x), np.isinf(x), np.isnan(x)
     with test_util.device(use_gpu=use_gpu):
       inx = ops.convert_to_tensor(x)
       ofinite, oinf, onan = math_ops.is_finite(inx), math_ops.is_inf(
           inx), math_ops.is_nan(inx)
       tf_finite, tf_inf, tf_nan = self.evaluate([ofinite, oinf, onan])
+    if x.dtype == dtypes_lib.bfloat16.as_numpy_dtype:
+      # NumPy will implicitly convert bfloat16 values to float16, so we cast to
+      # float32 to avoid this.
+      x = x.astype(np.float32)
+    np_finite, np_inf, np_nan = np.isfinite(x), np.isinf(x), np.isnan(x)
     self.assertAllEqual(np_inf, tf_inf)
     self.assertAllEqual(np_nan, tf_nan)
     self.assertAllEqual(np_finite, tf_finite)
@@ -934,11 +942,18 @@ class IsFiniteInfNanTest(test.TestCase):
     self.assertShapeEqual(np_finite, ofinite)
 
   def _testDtype(self, dtype):
-    fi = np.finfo(dtype)
-    data = np.array([
-        0, -1, 1, fi.resolution, -fi.resolution, fi.min, fi.max, -np.inf,
-        np.inf, np.nan
-    ]).astype(dtype)
+    if dtype != dtypes_lib.bfloat16.as_numpy_dtype:
+      fi = np.finfo(dtype)
+      data = np.array([
+          0, -1, 1, fi.resolution, -fi.resolution, fi.min, fi.max, -np.inf,
+          np.inf, np.nan
+      ]).astype(dtype)
+    else:
+      # np.finfo does not support bfloat16
+      data = np.array([
+          0, -1, 1, 0.01, -0.01, -3.3895e+38, 3.3895e+38, -np.inf, np.inf,
+          np.nan
+      ]).astype(dtype)
     self._compare(data, use_gpu=False)
     self._compare(data, use_gpu=True)
 
@@ -951,6 +966,9 @@ class IsFiniteInfNanTest(test.TestCase):
   def testDouble(self):
     self._testDtype(np.float64)
 
+  def testBfloat16(self):
+    self._testDtype(dtypes_lib.bfloat16.as_numpy_dtype)
+
   def testSqrt(self):
     for dtype in [np.float16, np.float32, np.float64]:
       fi = np.finfo(dtype)
@@ -998,8 +1016,8 @@ class RoundingTest(test.TestCase):
   def _testDtype(self, dtype):
     data = (np.arange(-3, 3) / 4.).reshape(1, 3, 2).astype(dtype)
     self._compare(data)
-    # TODO: rint op is not supported for float16
-    if dtype is np.float16:
+    # TODO(reedwm): rint op is not supported for float16 and bfloat16
+    if dtype in (np.float16, dtypes_lib.bfloat16.as_numpy_dtype):
       return
     self._compare_values(data)
     x = [0.5, 0.5000001]
@@ -1012,10 +1030,12 @@ class RoundingTest(test.TestCase):
     self._compare_values(x, y=y)
 
   def testTypes(self):
-    self.skipTest("b/131162241")
-    for dtype in [np.float16, np.float32, np.float64]:
-      with self.subTest(dtype=dtype):
-        self._testDtype(dtype)
+    # TODO(b/131162241): Enable test for GPU
+    with ops.device("/CPU:0"):
+      for dtype in [np.float16, np.float32, np.float64,
+                    dtypes_lib.bfloat16.as_numpy_dtype]:
+        with self.subTest(dtype=dtype):
+          self._testDtype(dtype)
 
 
 class ComplexMakeRealImagTest(test.TestCase):
diff --git a/tensorflow/python/kernel_tests/cwise_ops_unary_test.py b/tensorflow/python/kernel_tests/cwise_ops_unary_test.py
index df848a653d4..9d46ed35639 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_unary_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_unary_test.py
@@ -61,6 +61,8 @@ def _default_tolerance(dtype):
   Args:
     dtype: A datatype.
   """
+  if dtype == dtypes_lib.bfloat16.as_numpy_dtype:
+    return 5e-3
   if dtype == np.float16:
     return 5e-3
   elif dtype in (np.float32, np.complex64):
@@ -81,12 +83,7 @@ class UnaryOpTest(test.TestCase):
     np_ans = np_func(x)
     with self.cached_session(use_gpu=False):
       inx = ops.convert_to_tensor(x)
-      if x.dtype in (np.float32, np.float64,
-                     dtypes_lib.bfloat16.as_numpy_dtype):
-        y = 1.1 * tf_func(inx)
-        np_ans *= 1.1
-      else:
-        y = tf_func(inx)
+      y = tf_func(inx)
       tf_cpu = self.evaluate(y)
       self.assertShapeEqual(np_ans, y)
       if x.dtype == np.float16:
@@ -99,7 +96,7 @@ class UnaryOpTest(test.TestCase):
       if x.dtype in (np.complex64, np.complex128) and tf_func == math_ops.sign:
         return  # Return early
 
-      if x.dtype == np.float16:
+      if x.dtype in (np.float16, dtypes_lib.bfloat16.as_numpy_dtype):
         s = list(np.shape(x))
         jacob_t, _ = gradient_checker.compute_gradient(
             inx, s, y, s, x_init_value=x)
@@ -108,7 +105,7 @@ class UnaryOpTest(test.TestCase):
         yf = tf_func(inxf)
         _, jacob_n = gradient_checker.compute_gradient(
             inxf, s, yf, s, x_init_value=xf, delta=1e-2)
-        jacob_n = jacob_n.astype(np.float16)
+        jacob_n = jacob_n.astype(x.dtype)
         self.assertAllClose(jacob_t, jacob_n, rtol=grad_rtol, atol=grad_atol)
       elif x.dtype in (np.float32, np.complex64):
         s = list(np.shape(x))
@@ -384,13 +381,36 @@ class UnaryOpTest(test.TestCase):
     self._compareBothSparse(y, np.sign, math_ops.sign)
     self._compareBothSparse(x, np.vectorize(math.erf), math_ops.erf, tol=1e-3)
 
+  @test_util.run_deprecated_v1
   def testBFloat16Basic(self):
+    def compute_f32(np_func):
+      """Decorator to compute Numpy function with float32 math."""
+      def f(x):
+        y = np_func(x.astype(np.float32))
+        return y.astype(x.dtype)
+      return f
+
+    bfloat16 = dtypes_lib.bfloat16.as_numpy_dtype
     x = np.arange(-6, 6,
                   2).reshape(1, 3, 2).astype(dtypes_lib.bfloat16.as_numpy_dtype)
+    y = (x + .5).astype(bfloat16)  # no zero
+    z = (x + 15.5).astype(bfloat16)  # all positive
     self._compareCpu(x, np.abs, math_ops.abs)
     self._compareCpu(x, np.abs, _ABS)
     self._compareBoth(x, np.negative, math_ops.negative)
     self._compareBoth(x, np.negative, _NEG)
+    self._compareCpu(y, compute_f32(self._inv), math_ops.reciprocal)
+    self._compareCpu(x, np.exp, math_ops.exp)
+    self._compareCpu(x, np.expm1, math_ops.expm1)
+    self._compareCpu(z, compute_f32(np.log), math_ops.log)
+    self._compareCpu(z, compute_f32(np.log1p), math_ops.log1p)
+    self._compareCpu(y, np.sign, math_ops.sign)
+    self._compareBoth(x, compute_f32(np.sin), math_ops.sin)
+    self._compareBoth(x, compute_f32(np.cos), math_ops.cos)
+    self._compareBoth(x, compute_f32(np.tan), math_ops.tan)
+    self._compareBoth(x, compute_f32(np.sinh), math_ops.sinh)
+    self._compareBoth(x, compute_f32(np.cosh), math_ops.cosh)
+    self._compareBoth(x, compute_f32(np.tanh), math_ops.tanh)
 
   def testInt8Basic(self):
     x = np.arange(-6, 6, 2).reshape(1, 3, 2).astype(np.int8)
diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py
index 296395d034f..dabf4bb9d33 100644
--- a/tensorflow/python/ops/math_ops_test.py
+++ b/tensorflow/python/ops/math_ops_test.py
@@ -476,6 +476,13 @@ class DivAndModTest(test_util.TensorFlowTestCase):
     #               % array_ops.constant(divs))
     # self.assertAllEqual(tf2_result, tf_result)
 
+  def testFloorModBfloat16(self):
+    nums, divs = self.floatTestData()
+    tf_result = math_ops.floormod(math_ops.cast(nums, dtypes.bfloat16),
+                                  math_ops.cast(divs, dtypes.bfloat16))
+    np_result = nums % divs
+    self.assertAllEqual(tf_result, np_result)
+
   def testTruncateModInt(self):
     nums, divs = self.intTestData()
     tf_result = math_ops.truncatemod(nums, divs)